Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I. Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

Read in dependencies

library(ggplot2)
package ‘ggplot2’ was built under R version 3.6.2
library(treeio)
Registered S3 method overwritten by 'treeio':
  method     from
  root.phylo ape 
library(ggtree)
ggtree v2.5.0.991  For help: https://yulab-smu.top/treedata-book/

If you use ggtree in published research, please cite the most appropriate paper(s):

- Guangchuang Yu. Using ggtree to visualize data on tree-like structures. Current Protocols in Bioinformatics, 2020, 69:e96. doi:10.1002/cpbi.96
- Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution 2018, 35(12):3041-3043. doi:10.1093/molbev/msy194
- Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
library(ggnewscale)
package ‘ggnewscale’ was built under R version 3.6.2
library(plyr)
library(dplyr)
package ‘dplyr’ was built under R version 3.6.2
Attaching package: ‘dplyr’

The following objects are masked from ‘package:plyr’:

    arrange, count, desc, failwith, id, mutate, rename, summarise, summarize

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ tibble  3.0.3     ✓ purrr   0.3.4
✓ tidyr   1.1.0     ✓ stringr 1.4.0
✓ readr   1.3.1     ✓ forcats 0.5.0
package ‘tibble’ was built under R version 3.6.2package ‘tidyr’ was built under R version 3.6.2package ‘purrr’ was built under R version 3.6.2── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::arrange()   masks plyr::arrange()
x purrr::compact()   masks plyr::compact()
x dplyr::count()     masks plyr::count()
x tidyr::expand()    masks ggtree::expand()
x dplyr::failwith()  masks plyr::failwith()
x dplyr::filter()    masks stats::filter()
x dplyr::id()        masks plyr::id()
x dplyr::lag()       masks stats::lag()
x dplyr::mutate()    masks plyr::mutate()
x dplyr::rename()    masks plyr::rename()
x dplyr::summarise() masks plyr::summarise()
x dplyr::summarize() masks plyr::summarize()
library(phytools)
package ‘phytools’ was built under R version 3.6.2Loading required package: ape
package ‘ape’ was built under R version 3.6.2
Attaching package: ‘ape’

The following object is masked from ‘package:ggtree’:

    rotate

The following object is masked from ‘package:treeio’:

    drop.tip

Loading required package: maps

Attaching package: ‘maps’

The following object is masked from ‘package:purrr’:

    map

The following object is masked from ‘package:plyr’:

    ozone


Attaching package: ‘phytools’

The following object is masked from ‘package:treeio’:

    read.newick
library(randomcoloR)
library(RColorBrewer)
library(lubridate)
package ‘lubridate’ was built under R version 3.6.2
Attaching package: ‘lubridate’

The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union
library(readxl)
library(ggforce)
package ‘ggforce’ was built under R version 3.6.2
library(ggstance)
package ‘ggstance’ was built under R version 3.6.2
Attaching package: ‘ggstance’

The following objects are masked from ‘package:ggplot2’:

    geom_errorbarh, GeomErrorbarh
library(ggridges)
package ‘ggridges’ was built under R version 3.6.2
library(Cairo)
package ‘Cairo’ was built under R version 3.6.2
library(cowplot)

********************************************************
Note: As of version 1.0.0, cowplot does not change the
  default ggplot2 theme anymore. To recover the previous
  behavior, execute:
  theme_set(theme_cowplot())
********************************************************


Attaching package: ‘cowplot’

The following object is masked from ‘package:lubridate’:

    stamp
library(ggmap)
Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
Please cite ggmap if you use it! See citation("ggmap") for details.

Attaching package: ‘ggmap’

The following object is masked from ‘package:cowplot’:

    theme_nothing

The following objects are masked from ‘package:ggtree’:

    inset, theme_inset
library(CoordinateCleaner)
package ‘CoordinateCleaner’ was built under R version 3.6.2Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
library(gridExtra)

Attaching package: ‘gridExtra’

The following object is masked from ‘package:dplyr’:

    combine
library(hexbin)
library(emojifont)
library(scales)
package ‘scales’ was built under R version 3.6.2
Attaching package: ‘scales’

The following object is masked from ‘package:purrr’:

    discard

The following object is masked from ‘package:readr’:

    col_factor
library(pairsnp)
library(rPinecone)
replacing previous import ‘ape::ring’ by ‘igraph::ring’ when loading ‘rPinecone’replacing previous import ‘ape::mst’ by ‘igraph::mst’ when loading ‘rPinecone’replacing previous import ‘ape::edges’ by ‘igraph::edges’ when loading ‘rPinecone’replacing previous import ‘BMhyd::AICc’ by ‘phangorn::AICc’ when loading ‘rPinecone’replacing previous import ‘igraph::diversity’ by ‘phangorn::diversity’ when loading ‘rPinecone’
R.Version()
$platform
[1] "x86_64-apple-darwin15.6.0"

$arch
[1] "x86_64"

$os
[1] "darwin15.6.0"

$system
[1] "x86_64, darwin15.6.0"

$status
[1] ""

$major
[1] "3"

$minor
[1] "6.0"

$year
[1] "2019"

$month
[1] "04"

$day
[1] "26"

$`svn rev`
[1] "76424"

$language
[1] "R"

$version.string
[1] "R version 3.6.0 (2019-04-26)"

$nickname
[1] "Planting of a Tree"
print(sessionInfo())
R version 3.6.0 (2019-04-26)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.6

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib

locale:
[1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] rPinecone_0.1.0          pairsnp_0.1.0            scales_1.1.1             emojifont_0.5.3          hexbin_1.28.1           
 [6] gridExtra_2.3            CoordinateCleaner_2.0-17 ggmap_3.0.0              cowplot_1.0.0            Cairo_1.5-12.2          
[11] ggridges_0.5.3           ggstance_0.3.5           ggforce_0.3.2            readxl_1.3.1             lubridate_1.7.9         
[16] RColorBrewer_1.1-2       randomcoloR_1.1.0.1      phytools_0.7-47          maps_3.3.0               ape_5.4-1               
[21] forcats_0.5.0            stringr_1.4.0            purrr_0.3.4              readr_1.3.1              tidyr_1.1.0             
[26] tibble_3.0.3             tidyverse_1.3.0          dplyr_1.0.0              plyr_1.8.6               ggnewscale_0.4.5        
[31] ggtree_2.5.0.991         treeio_1.9.1             ggplot2_3.3.3           

loaded via a namespace (and not attached):
  [1] uuid_0.1-4              backports_1.1.8         BMhyd_1.2-8             fastmatch_1.1-0         igraph_1.2.5           
  [6] lazyeval_0.2.2          sp_1.4-2                rncl_0.8.4              digest_0.6.25           fansi_0.4.1            
 [11] magrittr_1.5            geoaxe_0.1.0            cluster_2.0.8           modelr_0.1.8            sysfonts_0.8.1         
 [16] prettyunits_1.1.1       jpeg_0.1-8.1            colorspace_1.4-1        blob_1.2.1              rvest_0.3.6            
 [21] haven_2.3.1             xfun_0.16               rgdal_1.5-16            crayon_1.3.4            jsonlite_1.7.0         
 [26] phylobase_0.8.10        phangorn_2.5.5          glue_1.4.1              polyclip_1.10-0         gtable_0.3.0           
 [31] geiger_2.0.7            V8_3.2.0                mvtnorm_1.1-1           oai_0.3.0               DBI_1.1.0              
 [36] Rcpp_1.0.5              showtextdb_3.0          plotrix_3.7-8           progress_1.2.2          units_0.6-7            
 [41] tidytree_0.3.3          subplex_1.6             deSolve_1.28            rgbif_3.3.0             animation_2.6          
 [46] httr_1.4.2              geosphere_1.5-10        ellipsis_0.3.1          XML_3.99-0.3            pkgconfig_2.0.3        
 [51] farver_2.0.3            dbplyr_1.4.4            conditionz_0.1.0        reshape2_1.4.4          tidyselect_1.1.0       
 [56] rlang_0.4.7             munsell_0.5.0           cellranger_1.1.0        tools_3.6.0             cli_2.0.2              
 [61] generics_0.0.2          ade4_1.7-15             broom_0.7.0             knitr_1.29              fs_1.4.2               
 [66] RgoogleMaps_1.4.5.3     showtext_0.9            nlme_3.1-139            whisker_0.4             aplot_0.0.6            
 [71] xml2_1.3.2              compiler_3.6.0          rstudioapi_0.11         curl_4.3                png_0.1-7              
 [76] e1071_1.7-3             reprex_0.3.0            clusterGeneration_1.3.4 tweenr_1.0.1            RNeXML_2.4.5           
 [81] stringi_1.4.6           rgeos_0.5-5             lattice_0.20-38         Matrix_1.2-17           classInt_0.4-3         
 [86] vctrs_0.3.2             pillar_1.4.6            lifecycle_0.2.0         BiocManager_1.30.10     combinat_0.0-8         
 [91] corpcor_1.6.9           data.table_1.12.8       bitops_1.0-6            raster_3.3-13           patchwork_1.0.1        
 [96] R6_2.4.1                KernSmooth_2.23-15      TreeSim_2.4             codetools_0.2-16        MASS_7.3-51.4          
[101] gtools_3.8.2            assertthat_0.2.1        proto_1.0.0             rjson_0.2.20            withr_2.2.0            
[106] rnaturalearth_0.1.0     mnormt_1.5-6            expm_0.999-5            parallel_3.6.0          hms_0.5.3              
[111] quadprog_1.5-8          grid_3.6.0              coda_0.19-3             class_7.3-15            rvcheck_0.1.8          
[116] Rtsne_0.15              sf_0.9-5                numDeriv_2016.8-1.1     scatterplot3d_0.3-41   

Read in data


# ML Tree (using raw sequences with minimal filters)
TPA.rawseq.ML.tree.file <- "TPA-uber.remasked.2020-11-10.lowcov75.SNPs.aln.renamed.treefile"

# ML tree (refined dataset)
TPA.MLtree.file <- "TPA-uber.remasked.2020-11-10.goodcov25.gubbins.SNPs.aln.renamed.treefile"

# Pyjar tree (refined dataset)
TPA.pyjar.file <- "TPA-uber.remasked.2020-11-10.goodcov25.gubbins.SNPs.aln.renamed.pyjar.tre"

# Multiple Sequence alignment of SNPs for ML tree/pyjar
TPA.MSA.SNPs.aln.file <- "TPA-uber.remasked.2020-11-10.goodcov25.gubbins.SNPs.renamed.aln"

# Pinecone clustering bootstrap data
pinecone.10.file <- "TPA-uber.bootstrapped-pinecone.10-3.2021-01-27.v2.pinecone.bootstrap.table.csv"

# Master Metadata spreadsheet
#TPA.meta1.file <- "/Users/mb29/Treponema/Expanded_Global_Sequencing/Global_Sequence_Collection_Info_update-03-2021.xlsx"
TPA.meta1.file <- "Supplementary_Data1_Sample-Metadata__03-2021.xlsx"


# Some population prevalence data
UK.stats.file <- "PHE_2019_UK_Syphilis-rate-per-100k-pop__02-11-2020.tsv"
BC.stats.file <- "BCCDC-Canada__BC-Syphilis-rate-per-100k-pop_2017___02-11-2020.tsv"

# BEAST Analyses
# Subsampled BEAST analysis 1
TPA.beast.subtree.file <- "TPA-uber.remasked.2020-11-10.gubbins.subsampled.2020-11-18.WGS.rendates.noInv_StrictCSkyline_combined.2020-11-23.consensus.tree"

beast.subtree.skyline.file <- "TPA-uber.remasked.2020-11-10.gubbins.subsampled.2020-11-18.WGS.rendates.noInv_StrictCSkyline_1.export_skyline_data.txt"

beast.subtree.skyline.lineage.file <- "TPA-uber_beast2_strict-skyline-500M_10pop_combined.lineages-data.tsv"


# Subsampled BEAST analysis 2 (repeat)
repeat.subsampled.skyline.tree.file <- "TPA-uber.remasked.2020-11-10.gubbins.subsampled.2.2020-11-23.WGS.rendates.noInv.Strict-Skyline-HKY_combined.2020-11-26.consensus.tree"

repeat.subsampled.skyline.data.file <- "TPA-uber.remasked.2020-11-10.gubbins.subsampled.2.2020-11-23.WGS.rendates.noInv.Strict-Skyline-HKY_1.skyline-data.tsv"
repeat.subsampled.skyline.lineages.data.file <- "TPA-uber.remasked.2020-11-10.gubbins.subsampled.2.2020-11-23.WGS.rendates.noInv.Strict-Skyline-HKY_1.skyline-lineages-data.tsv"

# Full size BEAST2 analysis
full.beast2.tree.file <- "TPA-uber_beast2_strict-skyline-500M_10pop_consensus.tree"

beast2.runs.filepath <- "./"

beast2.full.skyline.path <- "TPA-uber_beast2_strict-skyline-500M_10pop_combined.skyline-data.tsv"
beast2.full.lineages.path <- "TPA-uber_beast2_strict-skyline-500M_10pop_combined.lineages-data.tsv"
beast2.full.popdistro.path <- "TPA-uber_beast2_strict-skyline-500M_10pop_combined.pop-distributions_p100.txt"


beast2.pop.decline.file <- "TPA-uber_beast2_strict-skyline-500M_10pop_pop-decline.1990-2015.p50_population_change_distribution.csv"


beast2.pop.increase.file  <- "TPA-uber_beast2_strict-skyline-500M_10pop_pop-increase.1990-2015.p100_population_change_distribution.csv"

pop.decline.supporting.trees.file <- "TPA-uber_beast2_strict-skyline-500M_10pop_pop-decline.1990-2015.p50_trees_supporting.nex"

pop.decline.notsupporting.trees.file <- "TPA-uber_beast2_strict-skyline-500M_10pop_pop-decline.1990-2015.p50_trees_not_supporting.nex"


# Sublineage BEAST analysis
sublineage.skylines.filepath <- "./"
pop.distro.path <- "./" 
pop.distro.sublin.1.file <- "TPA-uber.remasked.2020-11-10.goodcov25.gubbins.WGS.sublineage.new.1.noinv.Strict-Skyline_combined.pop-expansion"
pop.distro.sublin.2.file <- "TPA-uber.remasked.2020-11-10.goodcov25.gubbins.WGS.sublineage.new.2.noinv.Strict-Skyline_combined.pop-expansion"
pop.distro.sublin.8.file <- "TPA-uber.remasked.2020-11-10.goodcov25.gubbins.WGS.sublineage.new.8.noinv.Strict-Skyline_combined.pop-expansion"

# Recombination analysis
recombination_event.file <- "Supplementary_Table_Recombination-Events_2021-03-25.xlsx"

# Tip Date Randomisation
random.tip.summary.file <- "clockRate.stats.csv"

# Macrolide resistance allele calls
TPA.global.compmapping.23s.file <-"competitive-mapping_combined-reports.all.2020-12-01.final.tsv"

# Pinecone assignments
pinecone.clusters.MLoriginal.file <- "Global-TPA.goodcov.rPinecone10-3.assignments_2020-11-11.csv"

WGS.site.positions.file <- "TPA-uber.remasked.2020-11-10.goodcov25.gubbins.SNPs.site-positions.txt"

# Location to save files to
Figure_output_directory <- "/Users/mb29/Papers/Global_Treponema_Uber-Paper_2020/Figures/Figure_Drafting/"

Read in trees

TPA.rawseq.ML.tree <- midpoint.root(read.tree(TPA.rawseq.ML.tree.file))
TPA.MLtree <- midpoint.root(read.tree(TPA.MLtree.file))
TPA.pyjar.tree <- midpoint.root(read.tree(TPA.pyjar.file))

make some shortcuts for plotting

y.theme.strip <- theme(axis.title.y = element_blank(), axis.text.y = element_blank(), axis.ticks.y= element_blank())
y.theme.strip.partial <- theme(axis.text.y = element_blank(), axis.ticks.y= element_blank())

x.theme.strip <- theme(axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x= element_blank())
x.theme.strip.partial <- theme(axis.text.x = element_blank(), axis.ticks.x= element_blank())
x.theme.strip.labs <- theme(axis.text.x = element_blank(),axis.title.x = element_blank())

x.theme.axis.rotate <- theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

theme.text.size <- theme(text = element_text(size = 10))

'%notin%' <- Negate('%in%')

Filter metadata

# First do some cleaning
#TPA.meta1 <- readxl::read_excel(TPA.meta1.file,sheet="All_useable_T.pal_01-2020_refil")
TPA.meta1 <- readxl::read_excel(TPA.meta1.file,sheet="Supplementary_Data1_Sample-Meta")
TPA.meta1 <- subset(TPA.meta1, select=-c(A2058G,A2059G,TPA.pinecone.sublineage))


TPA.meta1.2 <- TPA.meta1[TPA.meta1$Sample_Name %in% TPA.rawseq.ML.tree$tip.label,]

TPA.meta1.2 <- TPA.meta1.2[TPA.meta1.2$Species=="TPA",]
#TPA.meta1.2 <- TPA.meta1.2[TPA.meta1.2$Study_Type=="TPA-Global",]
TPA.meta1.2 <- TPA.meta1.2[TPA.meta1.2$Duplicate!="Yes",]
#TPA.meta1.2 <- TPA.meta1.2[TPA.meta1.2$Sample_Year!="-",]


#TPA.meta1.2 <- TPA.meta1.2[TPA.meta1.2$`SKA_Alignment_terrible>50%?`=="No",]
TPA.meta1.2 <- TPA.meta1.2[!is.na(TPA.meta1.2$Sample_Name),]
#TPA.meta1.2 <- data.frame(TPA.meta1.2,stringsAsFactors = F)


TPA.meta1.2$Geo_Country <- gsub("\\_","\\ ",TPA.meta1.2$Geo_Country)

do some date parsing to create date groups

floor_5years  <- function(value){ return(value - value %% 5) }
ceiling_5years <- function(value){ return(round_to_5years(value)+5) }
round_to_5years <- function(value){ return(round(value / 5) * 5) }

TPA.meta1.2$Sample_5year.floor <- floor_5years(as.numeric(TPA.meta1.2$Sample_Year))
NAs introduced by coercion
TPA.meta1.2$Sample_5year.window <- paste0(floor_5years(as.numeric(TPA.meta1.2$Sample_Year)),"-",floor_5years(as.numeric(TPA.meta1.2$Sample_Year))+5)
NAs introduced by coercionNAs introduced by coercion
# Some samples have uncertain dates (up to 20-30 years uncertainty), but for the purposes of these plotting categories we'll use the centrepoint year
TPA.meta1.2$Sample_5year.window <- sapply(1:nrow(TPA.meta1.2), function(x) ifelse(TPA.meta1.2$Sample_Year[x]=="-",NA, ifelse(is.na(TPA.meta1.2$Sample_5year.window[x]),NA, ifelse(TPA.meta1.2$Sample_Year[x]=="1950-1980","1965-1970",ifelse(TPA.meta1.2$Sample_Year[x]=="1960-1980","1965-1970" ,ifelse(TPA.meta1.2$Sample_Year[x]=="1980-1999","1985-1990",TPA.meta1.2$Sample_5year.window[x]))))))

#TPA.meta1.2[TPA.meta1.2$Sample_5year.window=="NA-NA","Sample_5year.window"] <- NA

#View(TPA.meta1.2[,c("Sample_Name","Sample_Year","Sample_5year.window")])


# Make colour scheme for date window
TPA.5year.window.brewcols <- data.frame(window.5year=unique(TPA.meta1.2$Sample_5year.window), stringsAsFactors=F)
TPA.5year.window.brewcols$window.5year <- TPA.5year.window.brewcols[order(TPA.5year.window.brewcols$window.5year),]
TPA.5year.window.brewcols$window.5year <- factor(TPA.5year.window.brewcols$window.5year, levels=TPA.5year.window.brewcols$window.5year)
# set colour scale
TPA.5year.window.brewcols$window.5year.cols <- c("Black",brewer.pal(n=11,"RdYlBu"),"white")

#c("1910-1915","1950-1955","1965-1970,"1970-1975","1975-1980","1980-1985","1985-1990","1990-1995","2000-2005","2005-2010","2010-2015","2015-2020","NA")


# Also create a numeric date year for some calculations
TPA.meta1.2$Sample_Year.num <- as.numeric((ifelse(TPA.meta1.2$Sample_Year=="1950-1980","1965",ifelse(TPA.meta1.2$Sample_Year=="1960-1980","1970",TPA.meta1.2$Sample_Year))))
NAs introduced by coercion

Create a colour scheme for countries and continents

# Colouring for country
continental.country.cols.brew2 <- unique(TPA.meta1.2[,c("Geo_Country","Continent")])
continental.country.cols.brew2 <- continental.country.cols.brew2[order(continental.country.cols.brew2$Continent,continental.country.cols.brew2$Geo_Country),]

continental.country.cols.brew2$country.col <- c("#ec7014","#fec44f","#de2d26","#fb6a4a","#bdbdbd","#737373",brewer.pal(n=8,"Purples")[3:8],brewer.pal(n=8,"Blues")[3:8],brewer.pal(n=5,"Greens")[3:5],"#c51b8a","#8c510a")

# Colouring for Continent
continental.cols.brew2 <- data.frame(Continent=sort(unique(TPA.meta1.2$Continent)),stringsAsFactors=F)
continental.cols.brew2$continent.col <- c("#fec44f","#de2d26","#bdbdbd","#2171b5","#74c476","#c51b8a","#ec7014")

# Colouring for TPA Lineage
TPA_Lineage.cols <- data.frame(Lineage=sort(unique(TPA.meta1.2$TPA_Lineage)),stringsAsFactors=F)
#TPA_Lineage.cols$Lineage.col <- c("royalblue2", "grey40", "indianred1")
TPA_Lineage.cols$Lineage.col <- c("royalblue2", "indianred1")
#c("#436eee", "#666666","#ff6a6a")
TPA_Lineage.cols$Lineage <- factor(TPA_Lineage.cols$Lineage, levels=c("Nichols","SS14","outlier"))
#TPA_Lineage.cols$Lineage <- factor(TPA_Lineage.cols$Lineage, levels=c("Nichols","SS14"))


# Lineage Hexcodes
# royalblue2 #436eee
# indianred1 #ff6a6a


#TPA_Lineage.cols[order(TPA_Lineage.cols$Lineage,c("Nichols","SS14","outlier")),]

check labelling issues

TPA.rawseq.ML.tree$tip.label[TPA.rawseq.ML.tree$tip.label %notin% TPA.meta1.2$Sample_Name]
character(0)
TPA.meta1.2$Sample_Name[TPA.meta1.2$Sample_Name %notin% TPA.rawseq.ML.tree$tip.label]
character(0)
# There is one very low coverage sample (TPA_BCC144, 47% genome breadth, 7.9X mean coverage) with odd phylogenetic placement - it's SS14, but basal in this analysis. Since the coverage is so low, it's not possible to further investigate this, so classify it here as SS14.
TPA.meta1.2[(TPA.meta1.2$TPA_Lineage=="outlier"),"TPA_Lineage"] <- "SS14"
# Prepare tree
TPA.rawseq.ML.ggtree <- ggtree(TPA.rawseq.ML.tree,layout = "fan",open.angle = 20, right=T)
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
TPA.rawseq.ML.ggtree <- ggtree(TPA.rawseq.ML.tree,layout = "fan",open.angle = 15, right=T)
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
# Prepare country dataset
TPA.rawseq.countries.p <- data.frame(row.names=TPA.meta1.2$Sample_Name, Country=TPA.meta1.2$Geo_Country, stringsAsFactors = F)

# Prepare continent dataset
TPA.rawseq.continents.p <- data.frame(row.names=TPA.meta1.2$Sample_Name, Continent=TPA.meta1.2$Continent, stringsAsFactors = F)

# Prepare Major lineage dataset
TPA.rawseq.Lineage.p <- data.frame(row.names=TPA.meta1.2$Sample_Name, Lineage=TPA.meta1.2$TPA_Lineage, stringsAsFactors = F)
TPA.rawseq.ML.ggtree.tippoints <- TPA.rawseq.ML.ggtree %<+% data.frame(Sample_Name=rownames(TPA.rawseq.continents.p), Continent=TPA.rawseq.continents.p$Continent, stringsAsFactors = F) + 
  geom_tippoint(aes(color=Continent), size=0.5, alpha=0.5) + 
  scale_color_manual(name="Continent",values=continental.cols.brew2$continent.col, breaks=continental.cols.brew2$Continent) 

# Rescale colours
TPA.rawseq.ML.ggtree.tippoints <- TPA.rawseq.ML.ggtree.tippoints + new_scale_color()

# Plot continent strip
p.TPA.rawseq.ML.ggtree.tippoint.country.cont <- gheatmap(TPA.rawseq.ML.ggtree.tippoints,TPA.rawseq.continents.p, color=NULL,width=0.075,colnames_angle=-45,colnames_offset_y=0.02, hjust=0.0,font.size=2.25) + 
  scale_fill_manual(name="Continent",values=continental.cols.brew2$continent.col, breaks=continental.cols.brew2$Continent) + 
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right') +
  NULL
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
p.TPA.rawseq.ML.ggtree.tippoint.country.cont <- p.TPA.rawseq.ML.ggtree.tippoint.country.cont + new_scale_fill()



p.TPA.rawseq.ML.ggtree.tippoint.country.cont <- gheatmap(p.TPA.rawseq.ML.ggtree.tippoint.country.cont,TPA.rawseq.countries.p, color=NULL,width=0.075,offset=0.00001225, colnames_angle=-45,colnames_offset_y=0.02, hjust=0.0,font.size=2.25) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right') +
  NULL
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
p.TPA.rawseq.ML.ggtree.tippoint.country.cont <- p.TPA.rawseq.ML.ggtree.tippoint.country.cont + new_scale_fill()


# Add sublineage
p.TPA.rawseq.ML.ggtree.tippoint.country.cont <- gheatmap(p.TPA.rawseq.ML.ggtree.tippoint.country.cont,TPA.rawseq.Lineage.p, color=NULL,width=0.075,offset=0.00002425, colnames_angle=-45,colnames_offset_y=0.02, hjust=-0.0,font.size=2.25) + 
  scale_fill_manual(name="Lineage",values=TPA_Lineage.cols$Lineage.col, breaks=TPA_Lineage.cols$Lineage) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right') +
  NULL
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
p.TPA.rawseq.ML.ggtree.tippoint.country.cont <- p.TPA.rawseq.ML.ggtree.tippoint.country.cont + new_scale_fill()

# Plot tree
#p.TPA.rawseq.ML.ggtree.tippoint.country.cont

#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure1_Global-TPA.low-cov-MLtree_02-20.svg"), width = 1000, height = 1000,type="svg",units = "pt")
p.TPA.rawseq.ML.ggtree.tippoint.country.cont

#dev.off()
TPA.rawseq.country.counts <- TPA.meta1.2[TPA.meta1.2$Sample_Year!="-",] %>% dplyr::group_by(Sample_Year, Geo_Country) %>% 
  dplyr::summarise(Count=n())

TPA.rawseq.country.counts$Sample_Year <- ifelse(TPA.rawseq.country.counts$Sample_Year=="1960-1980","1970",TPA.rawseq.country.counts$Sample_Year)

TPA.rawseq.country.counts <- plyr::join(data.frame(Sample_Year=c(1912:2019),stringsAsFactors=F), TPA.rawseq.country.counts)
TPA.rawseq.country.counts <- TPA.rawseq.country.counts[!is.na(TPA.rawseq.country.counts$Geo_Country),]
TPA.rawseq.country.counts$Geo_Country <- factor(TPA.rawseq.country.counts$Geo_Country, levels=continental.country.cols.brew2$Geo_Country)


p.Country.temporal.bubbleplot <- ggplot(TPA.rawseq.country.counts, aes(Sample_Year, Geo_Country, colour=Geo_Country)) +
  geom_point(alpha=0.65, aes(size=Count)) + 
  #guides(colour=FALSE) +
  theme_light() +
  scale_size_area(max_size = 8,breaks=c(1,5,10,25,50,75,100)) +
  scale_color_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none') +
  coord_cartesian(xlim=c(1950,2020)) +
  labs(y="Country", x="Sample Year")
#p.Country.temporal.bubbleplot

p.Country.temporal.bubbleplot.legend <- get_legend(p.Country.temporal.bubbleplot + theme(legend.key.size = unit(0.65,"line"),legend.position='left'))
raw.country.counts <- TPA.meta1.2 %>% group_by(Geo_Country) %>% summarise(Count=n())
`summarise()` ungrouping output (override with `.groups` argument)
raw.country.counts$Geo_Country <- factor(raw.country.counts$Geo_Country,levels=continental.country.cols.brew2$Geo_Country)

p.Country.hbarplot <- ggplot(raw.country.counts, aes(Count,Geo_Country,fill=Geo_Country)) +
  geom_barh(stat="identity", position="stack", width=0.75) +
  theme_light() +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none') +
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  geom_text(data=raw.country.counts, aes((Count+30), Geo_Country,label=Count), size=2.5, inherit.aes = F) +
  labs(y="Country", x="Samples/Country") +
  coord_cartesian(xlim=c(0,625))
#p.Country.hbarplot  

plot together

grid.arrange(p.Country.temporal.bubbleplot, p.Country.hbarplot + y.theme.strip, ncol=2, widths=c(3,1))

Now, lets think about just what lineages are where

major.lineage.country.summary.simple <- TPA.meta1.2[TPA.meta1.2$TPA_Lineage %in% c("SS14","Nichols"),] %>% dplyr::group_by(TPA_Lineage,Geo_Country) %>% 
  dplyr::summarise(total.samples=n())
`summarise()` regrouping output by 'TPA_Lineage' (override with `.groups` argument)
major.lineage.country.summary.simple$Geo_Country <- factor(major.lineage.country.summary.simple$Geo_Country, levels=rev(sort(unique(major.lineage.country.summary.simple$Geo_Country)))) 

country.summary.simple <- TPA.meta1.2[TPA.meta1.2$TPA_Lineage %in% c("SS14","Nichols"),] %>% dplyr::group_by(Geo_Country) %>% 
  dplyr::summarise(total.samples=n())
`summarise()` ungrouping output (override with `.groups` argument)
country.summary.simple$Geo_Country <- factor(country.summary.simple$Geo_Country, levels=rev(sort(unique(country.summary.simple$Geo_Country)))) 
major.lineage.country.summary.simple$TPA_Lineage <- factor(major.lineage.country.summary.simple$TPA_Lineage, levels=unique(major.lineage.country.summary.simple$TPA_Lineage))

p.majorlineage.country.props <- ggplot(major.lineage.country.summary.simple, aes(Geo_Country, total.samples, fill=TPA_Lineage)) +
  geom_bar(stat="identity",position = position_fill(reverse = TRUE)) +
  theme_light() + 
  scale_fill_manual(values=c("royalblue2","indianred1")) + 
  scale_y_continuous(breaks=c(0,0.5,1)) +
  coord_flip() +
  theme.text.size +
  labs(y="Proportion", x="Country")
p.majorlineage.country.counts <- ggplot(major.lineage.country.summary.simple, aes(Geo_Country, total.samples, fill=TPA_Lineage)) + 
         geom_bar(stat="identity",position="stack") +
  theme_light() + 
  scale_fill_manual(values=c("royalblue2","indianred1")) +
  #scale_y_log10() +
  coord_flip() +
  labs(fill="TPA Lineage", y="Sample Count") + 
  geom_text(data=country.summary.simple, aes(Geo_Country, (total.samples+22), label=total.samples), size=2.5, inherit.aes = F) +
  theme.text.size
#p.majorlineage.country.counts
major.lineage.country.summary.simple2 <- major.lineage.country.summary.simple
major.lineage.country.summary.simple2$Geo_Country <- factor(major.lineage.country.summary.simple2$Geo_Country,levels=continental.country.cols.brew2$Geo_Country) 

p.majorlineage.country.props.reordered <- ggplot(major.lineage.country.summary.simple2, aes(Geo_Country, total.samples, fill=TPA_Lineage)) + 
  #geom_bar(stat="identity",position="fill", width=0.75) +
  geom_bar(stat="identity",position = position_fill(reverse=TRUE), width=0.75) +
  
  theme_light() + 
  scale_fill_manual(values=TPA_Lineage.cols$Lineage.col, breaks=TPA_Lineage.cols$Lineage) + 
  scale_y_continuous(breaks=c(0,0.5,1)) +
  coord_flip() +
  theme.text.size +
  labs(y="Lineage Proportion", x="Country", fill="TPA Lineage")
#p.majorlineage.country.props.reordered
#grid.arrange(p.Country.temporal.bubbleplot, p.Country.hbarplot + y.theme.strip,p.majorlineage.country.props.reordered + y.theme.strip + theme.text.size + theme(legend.key.size = unit(0.75,"line")), ncol=3, widths=c(6,2,2))

Major lineage bubbleplot timeline


TPA.majorlineage.counts <- TPA.meta1.2[(TPA.meta1.2$Sample_Year!="-" & TPA.meta1.2$TPA_Lineage!="outlier"),] %>% dplyr::group_by(Sample_Year,TPA_Lineage) %>% 
  dplyr::summarise(Count=n())

TPA.majorlineage.counts$Sample_Year <- ifelse(TPA.majorlineage.counts$Sample_Year=="1960-1980","1970",TPA.majorlineage.counts$Sample_Year)

TPA.majorlineage.counts <- plyr::join(data.frame(Sample_Year=c(1912:2019),stringsAsFactors=F), TPA.majorlineage.counts)
TPA.majorlineage.counts <- TPA.majorlineage.counts[!is.na(TPA.majorlineage.counts$TPA_Lineage),]
TPA.majorlineage.counts$TPA_Lineage <- factor(TPA.majorlineage.counts$TPA_Lineage, levels=rev(TPA_Lineage.cols$Lineage))


p.majorlineage.temporal.bubbleplot <- ggplot(TPA.majorlineage.counts, aes(Sample_Year, TPA_Lineage, colour=TPA_Lineage)) +
  geom_point(alpha=0.65, aes(size=Count)) + 
  theme_light() +
  scale_size_area(max_size = 8,breaks=c(1,5,10,25,50,75,100)) +
  #scale_color_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  scale_color_manual(name="Lineage",values=TPA_Lineage.cols$Lineage.col, breaks=TPA_Lineage.cols$Lineage) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none') +
  coord_cartesian(xlim=c(1950,2020)) +
  labs(y="Lineage", x="Sample Year")
p.majorlineage.temporal.bubbleplot


p.majorlineage.temporal.bubbleplot.legend <- get_legend(p.majorlineage.temporal.bubbleplot + theme(legend.key.size = unit(0.65,"line"),legend.position='left') + guides(size=FALSE))



# total counts
#TPA.majorlineage.counts.simple <- TPA.meta1.2[(TPA.meta1.2$Sample_Year!="-" & TPA.meta1.2$TPA_Lineage!="outlier"),] %>% dplyr::group_by(TPA_Lineage) %>% 
#  dplyr::summarise(Count=n())
TPA.majorlineage.counts.simple <- TPA.meta1.2 %>% dplyr::group_by(TPA_Lineage) %>% 
  dplyr::summarise(Count=n())



p.majorlineage.total.hbarplot <- ggplot(TPA.majorlineage.counts, aes(Count,TPA_Lineage, fill=TPA_Lineage)) +
  geom_barh(stat="identity", position="stack", width=0.65) + 
  theme_light() +
  scale_fill_manual(name="Lineage",values=TPA_Lineage.cols$Lineage.col, breaks=TPA_Lineage.cols$Lineage) +
  geom_text(data=TPA.majorlineage.counts.simple, aes((Count+30), TPA_Lineage, label=Count), size=2.5, inherit.aes = F) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none') +
  labs(y="Lineage", x="Count") + 
  coord_cartesian(xlim=c(0,625))
p.majorlineage.total.hbarplot

Look at sample dates and whether a sample was passaged or not

TPA.meta1.2$Sample_Year.2000era <- ifelse(TPA.meta1.2$Sample_Year.num<2000,"pre2000","post1999")
TPA.meta1.2[TPA.meta1.2$Sample_Name=="DAL-1","Sample_Year.2000era"] <- "pre2000"


# Proportion of samples before and after 2000
TPA.meta1.2[!is.na(TPA.meta1.2$Sample_Year.2000era),] %>% dplyr::group_by(Sample_Year.2000era) %>%
  dplyr::summarise(count=n()) %>% 
  dplyr::mutate(perc=(count/sum(count)*100))
`summarise()` ungrouping output (override with `.groups` argument)
# Proportion of clinicals before 2000
data.frame(TPA.meta1.2[(!is.na(TPA.meta1.2$Sample_Year.2000era) & TPA.meta1.2$Sample_Year.2000era=="pre2000"),] %>% dplyr::group_by(Direct_from_clin) %>%
               dplyr::summarise(count=n()) %>% 
               dplyr::mutate(perc=(count/sum(count)*100)),stringsAsFactors = F)
`summarise()` ungrouping output (override with `.groups` argument)
# Proportion of clinicals after 1999
data.frame(TPA.meta1.2[(!is.na(TPA.meta1.2$Sample_Year.2000era) & TPA.meta1.2$Sample_Year.2000era=="post1999"),] %>% dplyr::group_by(Direct_from_clin) %>%
               dplyr::summarise(count=n()) %>% 
               dplyr::mutate(perc=(count/sum(count)*100)),stringsAsFactors = F)
`summarise()` ungrouping output (override with `.groups` argument)

Do a map of sample distributions


#country.coords.subset1 <- data.frame(Geo_Country=unique(sublineage.country.summary.simple$Geo_Country))
country.coords.subset <- data.frame(Geo_Country=raw.country.counts$Geo_Country, stringsAsFactors = F)

country.coords.subset$name <- gsub("Czech Republic","Czechia",gsub("USA","United States",gsub("UK","United Kingdom",gsub("\\_","\\ ",country.coords.subset$Geo_Country))))

# Russia is very large - let's centre on Tuva instead (but keep the labelling for database consistency)
country.coords.subset$name <- gsub("Russia", "Tuva", country.coords.subset$name)

# Merge with published centroid locations and deduplicate
country.coords.subset <- plyr::join(country.coords.subset,CoordinateCleaner::countryref,by="name")

# Mexico centre's oddly - take the location of Mexico City instead
country.coords.subset[country.coords.subset$Geo_Country=="Mexico","centroid.lon"] <- country.coords.subset[country.coords.subset$Geo_Country=="Mexico","capital.lon"]
country.coords.subset[country.coords.subset$Geo_Country=="Mexico","centroid.lat"] <- country.coords.subset[country.coords.subset$Geo_Country=="Mexico","capital.lat"]

country.coords.subset <- country.coords.subset[!duplicated(country.coords.subset$name),c("Geo_Country","name","centroid.lon","centroid.lat")]

# Merge with country sample counts
country.coords.subset.counts <- plyr::join(country.coords.subset,raw.country.counts,by="Geo_Country")


# ggmap
world.gps.bounds <- c(left=-120, bottom=-45, right= 150, top= 72)
stamanmap.global1 <- ggmap::get_stamenmap(bbox=world.gps.bounds, maptype = "toner-lite", zoom=3)

# Reduce the intensity of the basemap (third of the alpha)
stamanmap.global1.attribs <- attributes(stamanmap.global1)
stamanmap.global1.transparent <- matrix(adjustcolor(stamanmap.global1, alpha.f = 0.5),nrow=nrow(stamanmap.global1))
attributes(stamanmap.global1.transparent) <- stamanmap.global1.attribs 


# Plot map with country sampling
stamanmap.global1.p <- ggmap(stamanmap.global1.transparent)
stamanmap.global1.p <- stamanmap.global1.p + 
  #geom_point(data=country.coords.subset.counts, aes(centroid.lon, centroid.lat, size=Count+0.5),alpha=0.50, show.legend = F) + 
  geom_point(data=country.coords.subset.counts, aes(centroid.lon, centroid.lat, size=Count, color=Geo_Country),alpha=0.8) +
  guides(colour=FALSE) +
  theme_light() +
  theme(legend.key.size = unit(0.65,"line"),legend.position='right', ) + guides(fill=guide_legend(ncol=3)) +
  theme.text.size +
  scale_color_manual(values=continental.country.cols.brew2$country.col,breaks=continental.country.cols.brew2$Geo_Country) +
  #scale_size_area(max_size = 12,breaks=c(1,5,10,25,50,100,200,400)) +
  #scale_size_binned(breaks=c(1,5,10,25,50,100,200,400)) +
  scale_size_binned(range = c(2, 10),breaks=c(5,20,50,100,200,400)) +
  labs(y="Latitude", x="Longitude", color="Country", size="Sample\nCount")



#Cairo::Cairo(file=paste0(Figure_output_directory, "Global-TPA.low-cov__Map-of-Country-distributions11-2020.svg"), width = 800, height = 500,type="svg",units = "pt")
stamanmap.global1.p

#dev.off()

#stamanmap.global1.p

Plot all together as a combined panel grid

p.bubble.legends.grid <- plot_grid(p.majorlineage.temporal.bubbleplot.legend, p.Country.temporal.bubbleplot.legend, ncol=1, rel_heights=c(1,3))

first_row_country.dist <- plot_grid(stamanmap.global1.p,labels=c('A'),ncol=1,label_size = 11,vjust=-0.25)

row2.3_column_1_country.dist <- plot_grid(p.Country.temporal.bubbleplot, p.majorlineage.temporal.bubbleplot, ncol=1, rel_heights=c(4,1), align=T, labels=c('B','E'),label_size = 11,vjust=-0.25)

row2.3_column_2_country.dist <- plot_grid(p.Country.hbarplot + y.theme.strip + coord_cartesian(x=c(0,620)), p.majorlineage.total.hbarplot + theme(legend.position="none") + y.theme.strip + theme.text.size + coord_cartesian(x=c(0,620)) + labs(x="Samples/Lineage"),ncol=1, rel_heights=c(4,1), align=T, labels=c('C','F'),label_size = 11,vjust=-0.25)

row2.3_column_3_country.dist <- plot_grid(p.majorlineage.country.props.reordered + y.theme.strip + theme.text.size + theme(legend.position="none"), NULL, ncol=1, rel_heights=c(4,1),labels=c('D',''),label_size = 11,vjust=-0.25)

row2.3_combine.columns_country.dist <- plot_grid(row2.3_column_1_country.dist, row2.3_column_2_country.dist, row2.3_column_3_country.dist,p.bubble.legends.grid,rel_widths=c(5,2,1,2), ncol=4)

gg_all_country.dist.complex <- plot_grid(first_row_country.dist, row2.3_combine.columns_country.dist, labels=c('', ''), ncol=1, rel_heights = c(6,5), scale=0.95)

#gg_all_country.dist.complex

#Cairo::Cairo(file=paste0(Figure_output_directory, "Figure1__Country-distros_complex_02-2021.svg"), width = 1000, height = 800,type="svg",units = "pt")
gg_all_country.dist.complex

#dev.off()

Now move onto detailed gubbins masked phylogeny and sublineage analysis

#TPA.MLtree
#TPA.pyjar.tree

#ggtree(TPA.MLtree)
#ggtree(TPA.pyjar.tree)

# Extract SNP distances from pyjar tree
edge.TPAgubbins <- data.frame(TPA.pyjar.tree$edge, edge_num=1:length(TPA.pyjar.tree$edge.length),stringsAsFactors = F)
colnames(edge.TPAgubbins)=c("parent", "node", "edge_num")
edge.TPAgubbins$SNPs <- TPA.pyjar.tree$edge.length

# now build tree
TPA.pyjar.treeplot <- ggtree(TPA.pyjar.tree) %<+% 
  edge.TPAgubbins + geom_text(aes(x=branch, label=SNPs, vjust=-.5),size=3,color="grey50") +
  #geom_tiplab(size=2,align=T,offset=.0001) +
  NULL
TPA.pyjar.treeplot 

Do some clustering using rPinecone (takes a long time, so write out to file and reimport)

#TPA.pyjar.tree.phylo <- as.phylo(TPA.pyjar.tree)
#pinecone.output <- rPinecone::pinecone(TPA.pyjar.tree.phylo,10,3) # standard approach used for TPA

######################## testing ########################
#pinecone.output.8.3 <- rPinecone::pinecone(TPA.pyjar.tree.phylo,8,3) # modifying to see the effect 8.3
#pinecone.output.12.3 <- rPinecone::pinecone(TPA.pyjar.tree.phylo,12,3) # modifying to see the effect 12.3
#pinecone.output <- pinecone.output.12.3
######################## testing ########################

#pinecone.clusters <- data.frame(pinecone.output$table,stringsAsFactors = T)
#pinecone.clusters$Sub.lineage.sing <- gsub("Singleton\\_.+","Singleton",pinecone.clusters$Sub.lineage,perl=T)
#write.csv(pinecone.clusters, file=paste0(Figure_output_directory, "Global-TPA.goodcov.rPinecone10-3.assignments_2020-11-11.csv"))

Alternatively, use a bootstrapped version of rPinecone clusters (run on command line due to the time taken to iterate over 100 trees) Testing external pinecone (took about 3 hrs to run rPinecone over 100 bootstrap trees for the full dataset)


pinecone.10 <- read.csv(pinecone.10.file, stringsAsFactors=F)

length(unique(pinecone.10$Sub.lineage))
[1] 37
length(unique(pinecone.10$pinecone_95))
[1] 190
length(unique(pinecone.10$pinecone_80))
[1] 66
length(unique(pinecone.10$pinecone_50))
[1] 46
length(unique(pinecone.10$pinecone_20))
[1] 38
length(unique(pinecone.10$pinecone_5))
[1] 25
plot__pinecone.bootstraps <- function(external.pinecone.clustering) {
  gg <- ggtree(TPA.MLtree)
  f2 <- facet_plot(gg, panel = "rPinecone ML tree", data = external.pinecone.clustering, geom = geom_tile, 
                   aes(x = as.numeric(factor(Sub.lineage))), fill = "blue")
  f2 <- facet_plot(f2, panel = "rPinecone 95%", data = external.pinecone.clustering, geom = geom_tile, 
                   aes(x = as.numeric(factor(pinecone_95))), fill = "red")
  f2 <- facet_plot(f2, panel = "rPinecone 80%", data = external.pinecone.clustering, geom = geom_tile, 
                   aes(x = as.numeric(factor(pinecone_80))), fill = "blue")
  f2 <- facet_plot(f2, panel = "rPinecone 50%", data = external.pinecone.clustering, geom = geom_tile, 
                   aes(x = as.numeric(factor(pinecone_50))), fill = "red")
  f2 <- facet_plot(f2, panel = "rPinecone 20%", data = external.pinecone.clustering, geom = geom_tile, 
                   aes(x = as.numeric(factor(pinecone_20))), fill = "blue")
  f2 <- facet_plot(f2, panel = "rPinecone 5%", data = external.pinecone.clustering, geom = geom_tile, 
                   aes(x = as.numeric(factor(pinecone_5))), fill = "red") +
    theme(strip.background =element_rect(fill="white"),strip.text = element_text(size= 8)) +
    theme.text.size
  return(f2)
}


p.pinecone.boootstrap.cluster.eval <- plot__pinecone.bootstraps(pinecone.10)

#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure3_Global-TPA.good-MLtree_vs_rPinecone-thresholds_02-2021.svg"), width = 600, height = 600,type="svg",units = "pt")
p.pinecone.boootstrap.cluster.eval

#dev.off()

rPinecone clusters are dependent on both SNP distance and tree topology. Bootstrapping these clusters is very hard, because the data are so clonal - particularly at the top of the tree. Removing just a few columns for a bootstrap can change the topology of the very clonal cluster. However, it is clear that many of these clusters elsewhere are still valid and reproducible at even very stringent support criteria. Therefore, for consistency, decided to require the same clusters to be present in at least 5% of trees - this robust allows amalgamation of the top clade into a coherent group.

Since paper was largely analysed using non-bootstrapped clusters, will aim to use the same ordering scheme (but only use robust clusters).

pinecone.10.3__5pc <- unique(pinecone.10[,c("Sub.lineage","pinecone_5")])

# want to rename bootstrapped clusters to be reasonably consistent with consensus tree analysis, and ensure sensible order on tree)
pinecone.10.3__5pc <- data.frame(pinecone.10 %>% dplyr::group_by(pinecone_5) %>% summarise(count=n()))
`summarise()` ungrouping output (override with `.groups` argument)
pinecone.10.3__5pc$is.singleton <- ifelse(pinecone.10.3__5pc$count==1,"singleton", "multi")
pinecone.10.3__5pc <- plyr::join(pinecone.10.3__5pc, unique(pinecone.10[,c("Sub.lineage","pinecone_5")]), by="pinecone_5", type='full')
pinecone.10.3__5pc$Sub.lineage.sing <- sapply(1:nrow(pinecone.10.3__5pc), function(x) ifelse(grepl("singleton",pinecone.10.3__5pc$Sub.lineage[x]),23,pinecone.10.3__5pc$Sub.lineage[x]))
pinecone.10.3__5pc <- pinecone.10.3__5pc[order(as.numeric(pinecone.10.3__5pc$Sub.lineage.sing),pinecone.10.3__5pc$Sub.lineage),]

# Now extract non-singletons and rename in order
new.names <- pinecone.10.3__5pc[pinecone.10.3__5pc$Sub.lineage.sing!=23,]
new.names <- data.frame(pinecone_5=unique(new.names$pinecone_5), stringsAsFactors = F)
new.names$pinecone_5_newname <- c(1:nrow(new.names))
# Now extract singletons and rename 
new.names.sing <- pinecone.10.3__5pc[pinecone.10.3__5pc$is.singleton=="singleton",]
new.names.sing <- data.frame(pinecone_5=unique(new.names.sing$pinecone_5), stringsAsFactors = F)
new.names.sing$pinecone_5_newname <- "Singleton"
# Combine new name list
new.names <- rbind(new.names,new.names.sing)
# integrate into list of types
pinecone.10.3__5pc <- plyr::join(pinecone.10.3__5pc, new.names, by="pinecone_5", type='left')

# now apply back to samples 
pinecone.10 <- plyr::join(pinecone.10, (unique(pinecone.10.3__5pc[,c("pinecone_5","pinecone_5_newname")])), by="pinecone_5", type="left")
pinecone.10$pinecone_5_newname.numeric <- as.numeric(sapply(1:nrow(pinecone.10), function(x) ifelse(grepl("Singleton",pinecone.10$pinecone_5_newname[x]),18,pinecone.10$pinecone_5_newname[x])))

re-import rPinecone classifications (original analysis)


pinecone.clusters2 <- read.csv(pinecone.clusters.MLoriginal.file, row.names=1, comment.char="", check.names=F)



pinecone.clusters <- data.frame(pinecone.clusters2, stringsAsFactors = F)

#TPA.meta1.2
colnames(pinecone.clusters) <- c("Sample_Name", "pinecone.sublin.raw","pinecone.major.lin.raw", "TPA.pinecone.sublineage.pyjar")

# do some relabelling of major lineages
pinecone.clusters$TPA.pinecone.major <- ifelse(pinecone.clusters$pinecone.major.lin.raw=="0","outlier", ifelse(pinecone.clusters$pinecone.major.lin.raw=="1", "SS14", "Nichols"))

Integrate pinecone data with full meta

TPA.meta1.2.pinecone <- plyr::join(pinecone.clusters[,c("Sample_Name","TPA.pinecone.sublineage.pyjar","TPA.pinecone.major")], TPA.meta1.2, by="Sample_Name", type="left")

TPA.meta1.2.pinecone <- plyr::join(data.frame(Sample_Name=pinecone.10$Taxa, TPA.pinecone.sublineage=pinecone.10$pinecone_5_newname, stringsAsFactors = F),TPA.meta1.2.pinecone, by="Sample_Name", type="left")
# How many Nichols are there?
nrow(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA_Lineage=="Nichols",])
[1] 102
# How many SS14 are there?
nrow(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA_Lineage=="SS14",])
[1] 426
# How many outliers are there? (these are technically SS14/Nichols, but also basal in the phylogeny)
nrow(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.major=="outlier",])
[1] 4
# what are the outlier (non-SS14/Nihols) genomes?
TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.major=="outlier",]

# How many Singletons are there?
nrow(TPA.meta1.2.pinecone[grepl("Singleton",TPA.meta1.2.pinecone$TPA.pinecone.sublineage),])
[1] 8
(TPA.meta1.2.pinecone[grepl("Singleton",TPA.meta1.2.pinecone$TPA.pinecone.sublineage),])

# How many Nichols Singletons are there?
nrow(TPA.meta1.2.pinecone[(grepl("Singleton",TPA.meta1.2.pinecone$TPA.pinecone.sublineage) & TPA.meta1.2.pinecone$TPA_Lineage=="Nichols"),])
[1] 4
# How many SS14 Singletons are there?
nrow(TPA.meta1.2.pinecone[(grepl("Singleton",TPA.meta1.2.pinecone$TPA.pinecone.sublineage) & TPA.meta1.2.pinecone$TPA_Lineage=="SS14"),])
[1] 4
(TPA.meta1.2.pinecone[(grepl("Singleton",TPA.meta1.2.pinecone$TPA.pinecone.sublineage) & TPA.meta1.2.pinecone$TPA_Lineage=="SS14"),])
NA

Of the countries that have both Nichols and SS14, what is the breakdown?

# Which countries have both SS14 and Nichols?
Countries.both.lineages <- data.frame(unique(TPA.meta1.2[,c("Geo_Country","TPA_Lineage")]) %>% dplyr::group_by(Geo_Country) %>% dplyr::summarise(count=n()))
`summarise()` ungrouping output (override with `.groups` argument)
Countries.both.lineages <- Countries.both.lineages[Countries.both.lineages$count==2,"Geo_Country"]

# Proportion of Nichols/SS14 in this subset of countries
TPA.meta1.2[TPA.meta1.2$Geo_Country %in% Countries.both.lineages,] %>% 
  dplyr::group_by(TPA_Lineage) %>%
  summarise(count=n()) %>%
  mutate(percentage=(count/sum(count))*100)
`summarise()` ungrouping output (override with `.groups` argument)
# and by country
Lineage.perc.country <- data.frame(TPA.meta1.2[TPA.meta1.2$Geo_Country %in% Countries.both.lineages,] %>% 
  dplyr::group_by(Geo_Country,TPA_Lineage) %>%
  summarise(count=n()) %>%
  mutate(percentage=(count/sum(count))*100),stringsAsFactors = F)
`summarise()` regrouping output by 'Geo_Country' (override with `.groups` argument)
Lineage.perc.country

median(Lineage.perc.country[Lineage.perc.country$TPA_Lineage=="SS14","percentage"])
[1] 75.2907

Description of the new dataset

# Total samples in good tree
nrow(TPA.meta1.2.pinecone)
[1] 528
# Total 'new' samples in good tree
unique(TPA.meta1.2.pinecone$Citation)
 [1] "This_Study"            "Beale_2019_Global_TPA" "Tong_2017"             "Sun_2016"              "Chen_2020"            
 [6] "Pětrošová_2013"        "Arora_2016"            "Pinto_2016"            "Grillová_2019"         "unpublished-Giacani"  
[11] "Čejková_2012"         
nrow(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Citation %in% c("unpublished-WSI" ,"unpublished-Taiaroa"),])
[1] 0
nrow(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Citation %notin% c("unpublished-WSI" ,"unpublished-Taiaroa"),])
[1] 528
nrow(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Citation %in% c("This_Study"),])
[1] 401
nrow(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Citation %notin% c("This_Study"),])
[1] 127

Define colours for sublineages

# Define sublineage clustering scheme using brew colourscales
#sublineages.cols.brew <- data.frame(unique(TPA.meta1.2.pinecone[,c("TPA.pinecone.major","TPA.pinecone.sublineage.pyjar")]), stringsAsFactors = F)
sublineages.cols.brew <- data.frame(unique(TPA.meta1.2.pinecone[,c("TPA.pinecone.major","TPA.pinecone.sublineage")]), stringsAsFactors = F)

sublineages.cols.brew <- sublineages.cols.brew[order(sublineages.cols.brew$TPA.pinecone.major,sublineages.cols.brew$TPA.pinecone.sublineage),]

sublineages.cols.brew$sublin.order <- as.numeric(as.character(sublineages.cols.brew$TPA.pinecone.sublineage))
NAs introduced by coercion
sublineages.cols.brew <- sublineages.cols.brew[order(sublineages.cols.brew$sublin.order),]



#sublineages.cols.brew$sublineage.cols <- c(brewer.pal(n=7,"Blues")[2:7],brewer.pal(n=6,"Purples")[2:6],"grey80",brewer.pal(n=4,"YlOrBr")[c(2,3)], brewer.pal(n=7,"Reds")[2:6],brewer.pal(n=5,"Greens")[2:5],"grey80") 

# For revised bootstrapped clusters
sublineages.cols.brew$sublineage.cols <- sublineages.cols.brew$sublineage.cols <- c("#FC9272","#EF3B2C",brewer.pal(n=4,"Greens")[2:4],brewer.pal(n=4,"YlOrBr")[c(2,3)],brewer.pal(n=6,"Blues")[2:6],brewer.pal(n=6,"Purples")[2:6],"grey80","grey80")
  


sublineages.cols.brew <- unique(sublineages.cols.brew[,c("TPA.pinecone.sublineage","sublineage.cols")])
sublineages.cols.brew <- sublineages.cols.brew[order(as.numeric(as.character(sublineages.cols.brew$TPA.pinecone.sublineage))),]
NAs introduced by coercion
sublineages.cols.brew$TPA.pinecone.sublineage <- factor(sublineages.cols.brew$TPA.pinecone.sublineage, levels=sublineages.cols.brew$TPA.pinecone.sublineage)

colnames(sublineages.cols.brew) <- c("sublineage","sublineage.cols")
sublineages.cols.brew <- unique(sublineages.cols.brew)

now plot trees

#TPA.MLtree.ggtree <- ggtree(TPA.MLtree,layout = "fan",open.angle = 20, right=T)
#TPA.pyjar.treeplot 

p.TPA.pyjar.treeplot.tipsublineages <- TPA.pyjar.treeplot %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F) + 
  geom_tippoint(aes(color=Sublineage), size=1, alpha=0.5) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) 

p.TPA.pyjar.treeplot.stripsublineages <- gheatmap(TPA.pyjar.treeplot,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F), color=NULL,width=0.1,offset=0.0000005, colnames_angle=-45,colnames_offset_y=0.25, font.size=3) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) 
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
p.TPA.pyjar.treeplot.stripsublineages

Now for ML gubbins tree

TPA.MLtree.ggtree <- ggtree(TPA.MLtree,layout = "fan",open.angle = 20, right=T)
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
TPA.MLtree.ggtree.tippoint <- TPA.MLtree.ggtree %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F) + 
  geom_tippoint(aes(color=Sublineage), size=0.75, alpha=0.5) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage)

p.TPA.MLtree.sublineages <- gheatmap(TPA.MLtree.ggtree.tippoint,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F), color=NULL,width=0.075,offset=0.00000025, colnames_angle=-45,colnames_offset_y=0.02, hjust=-0.0,font.size=2.25) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right')
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
#p.TPA.MLtree.sublineages


#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure2__goodcov-MLtree_circular__02-2021.svg"), width = 800, height = 800,type="svg",units = "pt")
p.TPA.MLtree.sublineages

#dev.off()

or a linear tree (which can also capture legend from)

TPA.ML.ggtree.linear <- ggtree(TPA.MLtree) %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F) + 
  geom_tippoint(aes(color=Sublineage), size=0.75, alpha=0.5, show.legend = F) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  # add bootstrap support
  geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=1.5, shape=18) +
  #geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=2.5, shape=18, alpha=0.5) +
  #geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=1.5, shape=18, color='cyan', alpha=0.5) +
  NULL

p.TPA.ML.ggtree.linear <- gheatmap(TPA.ML.ggtree.linear,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F), color=NULL,width=0.075,offset=0.00000025, colnames_angle=0,colnames_offset_y=-1, font.size=2) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.75,"line"),legend.position='bottom')
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
p.TPA.ML.ggtree.linear <- p.TPA.ML.ggtree.linear + new_scale_fill()

p.TPA.ML.ggtree.linear <- gheatmap(p.TPA.ML.ggtree.linear,TPA.rawseq.countries.p, color=NULL,width=0.075,offset=0.00000725, colnames_angle=0,colnames_offset_y=-1, font.size=2) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='bottom') + 
  geom_treescale(fontsize = 2.5, x=0.000001, y=50) +
  NULL
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
p.TPA.ML.ggtree.linear


# Capture legend from full tree
p.TPA.ML.ggtree.linear.legend <- get_legend(p.TPA.ML.ggtree.linear)
NAs introduced by coercionNAs introduced by coercion

Define subtrees using ‘collapse’ clade


#ggtree(TPA.MLtree,layout = "fan",open.angle = 20, right=T) + geom_text2(aes(subset=!isTip, label=node), hjust=-.3) 
#ggtree(TPA.MLtree) + geom_text2(aes(subset=!isTip, label=node), hjust=-.3, size=3) 

SS14.subtree.nodeid <- 530 # 529
Nichols.subtree.nodeid <- 955 

Nichols subtree (with collapsed SS14)


#TPA.ML.ggtree.linear + geom_text2(aes(subset=!isTip, label=node), hjust=-.3, size=3) 

Nichols.coll <- ggtree(TPA.MLtree) %>% collapse(node=SS14.subtree.nodeid)

# Collaping node reduces 'y' position, so lets add some back for proper spacing
Nichols.coll$data[Nichols.coll$data$node==SS14.subtree.nodeid,"y"] <- Nichols.coll$data[Nichols.coll$data$node==SS14.subtree.nodeid,"y"] + 10

# now add triangle and text
Nichols.coll <- Nichols.coll + geom_text2(aes(subset=(node == SS14.subtree.nodeid)), size=20, label=intToUtf8(9664), hjust=0.2,vjust=.55, family="OpenSansEmoji", color="indianred1", alpha=.75)
Nichols.coll <- Nichols.coll + geom_text2(aes(subset=(node == SS14.subtree.nodeid)), cex=3, vjust=0.2, label="SS14",hjust = -1.5)
Nichols.coll <- Nichols.coll %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F) + 
  geom_tippoint(aes(color=Sublineage), size=1, alpha=0.25,show.legend=F) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  # add bootstrap support
  geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=1.5, shape=18) +
  #geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=2.5, shape=18, alpha=0.5) +
  #geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=1.5, shape=18, color="cyan", alpha=0.5) +
  NULL


p.TPA.Nichols.coll <- gheatmap(Nichols.coll,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F), color=NULL,width=0.085,offset=0.00000025, colnames_angle=0,colnames_offset_y=-1, font.size=2) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.85,"line"),legend.position='none')

p.TPA.Nichols.coll <- p.TPA.Nichols.coll + new_scale_fill()

p.TPA.Nichols.coll <- gheatmap(p.TPA.Nichols.coll,TPA.rawseq.countries.p, color=NULL,width=0.085,offset=0.00001025, colnames_angle=0,colnames_offset_y=-1, font.size=2) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size + theme(legend.key.size = unit(0.75,"line"),legend.position='none') +
  geom_treescale(fontsize = 2.5, x=0.000002, y=25) + 
  ylim(-3,116) +
  #ggtitle("Nichols-lineage phylogeny") +
  NULL
p.TPA.Nichols.coll

SS14 subtree (with collapsed Nichols)


#TPA.ML.ggtree.linear + geom_text2(aes(subset=!isTip, label=node), hjust=-.3, size=3) 

SS14.coll <- ggtree(TPA.MLtree) %>% collapse(node=Nichols.subtree.nodeid)

# Collaping node reduces 'y' position, so lets add some back for proper spacing
SS14.coll$data[SS14.coll$data$node==Nichols.subtree.nodeid,"y"] <- SS14.coll$data[SS14.coll$data$node==Nichols.subtree.nodeid,"y"] - 25

SS14.coll <- SS14.coll + geom_text2(aes(subset=(node == Nichols.subtree.nodeid)), size=20, label=intToUtf8(9664), hjust=0.2,vjust=.55, family="OpenSansEmoji", color="royalblue2", alpha=.85)
SS14.coll <- SS14.coll + geom_text2(aes(subset=(node == Nichols.subtree.nodeid)), cex=3, vjust=0.2, label="Nichols",hjust = -1)
SS14.coll <- SS14.coll %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F) + 
  geom_tippoint(aes(color=Sublineage), size=1, alpha=0.25,show.legend=F) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  # add bootstrap support
  geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=1.5, shape=18) +
  #geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=2.0, shape=18, alpha=0.25) +
  #geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=1.5, shape=18, color="cyan", alpha=0.5) +
  NULL
#SS14.coll


p.TPA.SS14.coll <- gheatmap(SS14.coll,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F), color=NULL,width=0.085,offset=0.00000025, colnames_angle=0,colnames_offset_y=-1, font.size=2) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none')

p.TPA.SS14.coll <- p.TPA.SS14.coll + new_scale_fill()

p.TPA.SS14.coll <- gheatmap(p.TPA.SS14.coll,TPA.rawseq.countries.p, color=NULL,width=0.085,offset=0.00001025, colnames_angle=0,colnames_offset_y=-1, font.size=2) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none') +
  geom_treescale(fontsize = 2.5, x=0.000002, y=65) +
  #ggtitle("SS14-lineage phylogeny") +
  ylim(-30,429) +
  NULL

p.TPA.SS14.coll

Plot collapsed trees together


# make Nichols tree shorter
coll.trees.nichols.shorter <- plot_grid(NULL, p.TPA.Nichols.coll, NULL, ncol=1, rel_heights=c(2,7,1))

#coll.trees.row.1 <- plot_grid(p.TPA.Nichols.coll,p.TPA.SS14.coll,ncol=2, rel_widths=c(1,1), labels=c('Nichols','SS14'),label_size = 11)
#coll.trees.row.1 <- plot_grid(p.TPA.Nichols.coll,p.TPA.SS14.coll,ncol=2, rel_widths=c(1,1), labels=c('A','B'),label_size = 11)

#coll.trees.row.1 <- plot_grid(coll.trees.nichols.shorter,p.TPA.SS14.coll,ncol=2, rel_widths=c(1,1), labels=c('A','B'),label_size = 11)
coll.trees.row.1 <- plot_grid(p.TPA.SS14.coll,coll.trees.nichols.shorter,ncol=2, rel_widths=c(1,1), labels=c('A - SS14-lineage phylogeny','B - Nichols-lineage phylogeny'),label_size = 11)


coll.trees.row.2 <- plot_grid(p.TPA.ML.ggtree.linear.legend,ncol=1,labels='Key',label_size = 11)
coll.trees.combined <- plot_grid(coll.trees.row.1, coll.trees.row.2, ncol=1, rel_heights=c(3,1), scale=0.95)

coll.trees.combined

Sublineage timeline bubbleplot


#TPA.sublineages.counts <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Sample_Year!="-" & TPA.meta1.2.pinecone$TPA.pinecone.sublineage!="Singleton"),] %>% dplyr::group_by(Sample_Year, TPA.pinecone.sublineage) %>% 
#  dplyr::summarise(Count=n())

TPA.sublineages.counts <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Sample_Year!="-" ),] %>% dplyr::group_by(Sample_Year, TPA.pinecone.sublineage) %>% 
  dplyr::summarise(Count=n())



TPA.sublineages.counts$Sample_Year <- ifelse(TPA.sublineages.counts$Sample_Year=="1960-1980","1970",TPA.sublineages.counts$Sample_Year)

TPA.sublineages.counts <- plyr::join(data.frame(Sample_Year=c(1912:2019),stringsAsFactors=F), TPA.sublineages.counts)
TPA.sublineages.counts <- TPA.sublineages.counts[!is.na(TPA.sublineages.counts$TPA.pinecone.sublineage),]
TPA.sublineages.counts$TPA.pinecone.sublineage <- factor(TPA.sublineages.counts$TPA.pinecone.sublineage, levels=rev(sublineages.cols.brew$sublineage))


p.sublineage.temporal.bubbleplot <- ggplot(TPA.sublineages.counts, aes(Sample_Year, TPA.pinecone.sublineage, colour=TPA.pinecone.sublineage)) +
  geom_point(alpha=0.65, aes(size=Count)) + 
  guides(colour=FALSE) +
  theme_light() +
  scale_size_area(max_size = 8,breaks=c(1,5,10,25,50,75,100)) +
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right') +
  coord_cartesian(xlim=c(1950,2020)) +
  labs(y="Sublineage", x="Sample Year")

p.sublineage.temporal.bubbleplot.legend <- get_legend(p.sublineage.temporal.bubbleplot)

p.sublineage.temporal.bubbleplot

#sublineage.counts <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Sample_Year!="-" & TPA.meta1.2.pinecone$TPA.pinecone.sublineage!="Singleton"),] %>% group_by(TPA.pinecone.sublineage) %>% summarise(Count=n())
sublineage.counts <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Sample_Year!="-" ),] %>% group_by(TPA.pinecone.sublineage) %>% summarise(Count=n())
`summarise()` ungrouping output (override with `.groups` argument)
sublineage.counts$TPA.pinecone.sublineage <- factor(sublineage.counts$TPA.pinecone.sublineage,levels=rev(sublineages.cols.brew$sublineage))

p.sublineage.hbarplot <- ggplot(sublineage.counts, aes(Count,TPA.pinecone.sublineage,fill=TPA.pinecone.sublineage)) +
  geom_barh(stat="identity", position="stack", width=0.75) +
  theme_light() +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none') +
  #scale_fill_manual(name="Country",values=sublineage.counts$TPA.pinecone.sublineage, breaks=sublineage.counts$Geo_Country) +
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  geom_text(data=sublineage.counts, aes((Count+10), TPA.pinecone.sublineage,label=Count), size=2.5, inherit.aes = F) +
  labs(y="Sublineage", x="Samples/Sublineage")
p.sublineage.hbarplot  

Proportion by country

#sublineage.country.counts <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Sample_Year!="-" & TPA.meta1.2.pinecone$TPA.pinecone.sublineage!="Singleton"),] %>% group_by(TPA.pinecone.sublineage, Geo_Country) %>% summarise(Count=n())
sublineage.country.counts <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Sample_Year!="-" ),] %>% group_by(TPA.pinecone.sublineage, Geo_Country) %>% summarise(Count=n())
`summarise()` regrouping output by 'TPA.pinecone.sublineage' (override with `.groups` argument)
sublineage.country.counts$TPA.pinecone.sublineage <- factor(sublineage.country.counts$TPA.pinecone.sublineage,levels=rev(sublineages.cols.brew$sublineage))

p.sublineage.country.hbarplot <- ggplot(sublineage.country.counts, aes(Count,TPA.pinecone.sublineage,fill=Geo_Country)) +
  geom_barh(stat="identity", position="fill", width=0.75) +
  theme_light() +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none') +
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  labs(y="Sublineage", x="Country Proportion")
p.sublineage.country.hbarplot

p.sublineage.timeline.bubbleplot.combined <- plot_grid(p.sublineage.temporal.bubbleplot + theme(legend.position="none"), p.sublineage.hbarplot + y.theme.strip, p.sublineage.country.hbarplot + y.theme.strip, p.sublineage.temporal.bubbleplot.legend, ncol=4, align=T, rel_widths=c(5,2,2,1),labels=c('C','D','E',''),label_size = 11,vjust=-0.25) #+ theme(legend.position="right")


gg.colltrees.sublineage.distributions <- plot_grid(coll.trees.combined, p.sublineage.timeline.bubbleplot.combined, ncol=1, rel_heights=c(5,2))
#gg.colltrees.sublineage.distributions

#Cairo::Cairo(file=paste0(Figure_output_directory, "Figure2__goodcov-MLtree_+sublineage-distros__02-2021.svg"), width = 900, height = 900,type="svg",units = "pt")
gg.colltrees.sublineage.distributions

#dev.off()

Now take a deep dive into Single country/region sublineage dynamics (UK-wide v.s. British Columbia, Canada)


TPA.sublineage_UK.Canada.temporal <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Geo_Country=="UK" | TPA.meta1.2.pinecone$Geo_Region=="British_Columbia"),]

TPA.sublineage_UK.Canada.temporal.counts <- data.frame(TPA.sublineage_UK.Canada.temporal %>% dplyr::group_by(TPA.pinecone.sublineage,Sample_Year,Geo_Country,TPA.pinecone.major) %>% 
  dplyr::summarise(Sample.Count=n()), stringsAsFactors = F)
`summarise()` regrouping output by 'TPA.pinecone.sublineage', 'Sample_Year', 'Geo_Country' (override with `.groups` argument)
# Fix dates and make continuous
TPA.sublineage_UK.Canada.temporal.counts$Sample_Year <- as.numeric(TPA.sublineage_UK.Canada.temporal.counts$Sample_Year)


TPA.sublineage_UK.Canada.temporal.counts <- plyr::join(data.frame(Sample_Year=c(1912:2019),stringsAsFactors=F),TPA.sublineage_UK.Canada.temporal.counts, by="Sample_Year", type="left")

TPA.sublineage_UK.Canada.temporal.counts <- TPA.sublineage_UK.Canada.temporal.counts[!is.na(TPA.sublineage_UK.Canada.temporal.counts$TPA.pinecone.sublineage),]

# order by sublineage
TPA.sublineage_UK.Canada.temporal.counts$TPA.pinecone.sublineage <- factor(TPA.sublineage_UK.Canada.temporal.counts$TPA.pinecone.sublineage, levels=rev(sublineages.cols.brew$sublineage))

# make bubbleplot
plot.TPA.sublineage_UK.Canada.temporal.counts.bubbleplot <- 
  ggplot(TPA.sublineage_UK.Canada.temporal.counts, aes(Sample_Year, TPA.pinecone.sublineage, color=TPA.pinecone.sublineage)) +
  geom_point(alpha=0.70,aes(size=Sample.Count)) + 
  geom_line(alpha=0.25) +
  theme_light() +
  labs(x="Sample Year", y="Sublineage", size="Sample Count") +
  coord_cartesian(xlim=c(2000,2020)) +
  theme(strip.text.y = element_text(angle = 0)) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  facet_grid(.~Geo_Country) + 
  scale_size_area(max_size = 10,breaks=c(1,5,10,20,30,40,50)) +
  theme.text.size + 
  theme(legend.key.size = unit(0.65,"line")) +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.x = element_text(color = "grey25")) +
  NULL
plot.TPA.sublineage_UK.Canada.temporal.counts.bubbleplot 

NA
NA

get population prevalence stats


UK.stats <- read.table(UK.stats.file,sep="\t",header=T)
UK.stats$Country <- "UK"


BC.stats <- read.table(BC.stats.file,sep="\t",header=T)
BC.stats$Country <- "British Columbia"

UK.BC.stats.combined <- rbind(UK.stats,BC.stats)
UK.BC.stats.combined[UK.BC.stats.combined$Country=="UK","Country"] <- "England"


plot.UK.BC.stats.combined <- ggplot(UK.BC.stats.combined, aes(Year,Total)) + 
  geom_line() +
  theme_light() +
  labs(x="Year",y="Syphilis Incidence/100,000") + 
  #scale_x_continuous(breaks=seq(2008,2019,2)) +
  coord_cartesian(xlim=c(2000,2020)) +
  #ggtitle("Syphilis Incidence data") +
  facet_grid(.~Country) +
  theme.text.size +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.x = element_text(color = "grey25", size=10))
plot.UK.BC.stats.combined

plot_grid(plot.UK.BC.stats.combined + x.theme.strip + ggtitle("Syphilis Incidence and sublineage count"), plot.TPA.sublineage_UK.Canada.temporal.counts.bubbleplot + theme(legend.position="bottom",strip.background = element_blank(),strip.text.x = element_blank()), ncol=1, align=T, rel_heights=c(1,2), labels=c('A','B'), label_size=11)

some stats about the canadian outbreak

BC.sublineage.summary <- data.frame(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Geo_Region=="British_Columbia",] %>% dplyr::group_by(TPA.pinecone.sublineage,Geo_Country,TPA.pinecone.major) %>% dplyr::summarise(Sample.Count=n()), stringsAsFactors = F)
`summarise()` regrouping output by 'TPA.pinecone.sublineage', 'Geo_Country' (override with `.groups` argument)
(BC.sublineage.summary[BC.sublineage.summary$TPA.pinecone.sublineage==1,"Sample.Count"]/sum(BC.sublineage.summary$Sample.Count))*100
[1] 82.14286
BC.sublineage.summary.pre2010 <- data.frame(TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Geo_Region=="British_Columbia" & TPA.meta1.2.pinecone$Sample_Year<=2010),] %>% dplyr::group_by(TPA.pinecone.sublineage,Geo_Country,TPA.pinecone.major) %>% dplyr::summarise(Sample.Count=n()), stringsAsFactors = F)
`summarise()` regrouping output by 'TPA.pinecone.sublineage', 'Geo_Country' (override with `.groups` argument)
(BC.sublineage.summary.pre2010[BC.sublineage.summary.pre2010$TPA.pinecone.sublineage==1,"Sample.Count"]/sum(BC.sublineage.summary.pre2010$Sample.Count))*100
[1] 85.71429
BC.sublineage.summary.post2011 <- data.frame(TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Geo_Region=="British_Columbia" & TPA.meta1.2.pinecone$Sample_Year>=2011),] %>% dplyr::group_by(TPA.pinecone.sublineage,Geo_Country,TPA.pinecone.major) %>% dplyr::summarise(Sample.Count=n()), stringsAsFactors = F)
`summarise()` regrouping output by 'TPA.pinecone.sublineage', 'Geo_Country' (override with `.groups` argument)
(BC.sublineage.summary.post2011[BC.sublineage.summary.post2011$TPA.pinecone.sublineage==1,"Sample.Count"]/sum(BC.sublineage.summary.post2011$Sample.Count))*100
[1] 81.42857

and the UK dataset

UK.sublineage.summary <- data.frame(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Geo_Country=="UK",] %>% dplyr::group_by(TPA.pinecone.sublineage,Geo_Country,TPA.pinecone.major) %>% dplyr::summarise(Sample.Count=n()), stringsAsFactors = F)
`summarise()` regrouping output by 'TPA.pinecone.sublineage', 'Geo_Country' (override with `.groups` argument)


# Now look more globally
Sublineages per country - note that this plot does not account for multiple Singletons being in the same country - replaced.

sublineage.count.per.country <- data.frame(unique(TPA.meta1.2.pinecone[,c("TPA.pinecone.sublineage","Geo_Country")]) %>% dplyr::group_by(Geo_Country) %>%
  dplyr::summarise(count=n()), stringsAsFactors = F)
`summarise()` ungrouping output (override with `.groups` argument)
sublineage.count.per.country$Geo_Country <- factor(sublineage.count.per.country$Geo_Country, levels=continental.country.cols.brew2$Geo_Country)


p.sublineages.per.country.bar <- ggplot(sublineage.count.per.country, aes(Geo_Country, count, fill=Geo_Country)) + 
  geom_bar(stat='identity', width=0.75) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) + 
  theme_light() +
  x.theme.axis.rotate + 
  theme.text.size +
  labs(x="Country", y="Sublineages/Country") +
  theme(legend.key.size = unit(0.65,"line"),legend.position='bottom') + 
  scale_y_continuous(breaks=seq(0,14,2))

p.sublineages.per.country.bar

Look at Singleton, private, and multi-country sublineages

# How many Singleton lineages are in each country?
Singleton.country.counts <- data.frame(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage=="Singleton",c("Sample_Name","Geo_Country")] %>% 
  dplyr::group_by(Geo_Country) %>%
  dplyr::summarise(per.country=n()), stringsAsFactors = F)
`summarise()` ungrouping output (override with `.groups` argument)
Singleton.country.counts <- plyr::join(data.frame(Geo_Country=c(unique(TPA.meta1.2.pinecone$Geo_Country))), Singleton.country.counts, by="Geo_Country", type="left")
Singleton.country.counts[is.na(Singleton.country.counts$per.country),"per.country"] <- 0
Singleton.country.counts$lineage.type <- "Singleton in country"

Singleton.country.counts
TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage=="Singleton","TPA_Lineage"]
[1] "SS14"    "SS14"    "SS14"    "SS14"    "Nichols" "Nichols" "Nichols" "Nichols"
Private.country.counts <- unique(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage!="Singleton",c("TPA.pinecone.sublineage", "Geo_Country")]) %>% 
  dplyr::group_by(TPA.pinecone.sublineage) %>%
  summarise(Countries.count=n(),.groups="keep")
Private.country.counts$private.distro <- ifelse(Private.country.counts$Countries.count==1,"private","multicountry")

Private.country.counts

# How many private lineages are in each country?
Private.country.locations <- unique(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage %in% as.character(unlist(Private.country.counts[Private.country.counts$private.distro=='private',"TPA.pinecone.sublineage"])),c(c("TPA.pinecone.sublineage", "Geo_Country"))])
Private.country.location.counts <- Private.country.locations %>% dplyr::group_by(Geo_Country) %>%
  dplyr::summarise(per.country=n())
`summarise()` ungrouping output (override with `.groups` argument)
Private.country.location.counts <- plyr::join(data.frame(Geo_Country=c(unique(TPA.meta1.2.pinecone$Geo_Country)),stringsAsFactors = F), Private.country.location.counts, by="Geo_Country", type="left")
Private.country.location.counts[is.na(Private.country.location.counts$per.country),"per.country"] <- 0
Private.country.location.counts$lineage.type <- "Private sublineage to country"

# How many multicountry lineages are in each country?
multi.country.locations.counts <- unique(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage %in% as.character(unlist(Private.country.counts[Private.country.counts$private.distro=='multicountry',"TPA.pinecone.sublineage"])),c(c("TPA.pinecone.sublineage", "Geo_Country"))])
multi.country.locations.counts <- multi.country.locations.counts %>% dplyr::group_by(Geo_Country) %>%
  dplyr::summarise(per.country=n())
`summarise()` ungrouping output (override with `.groups` argument)
multi.country.locations.counts <- plyr::join(data.frame(Geo_Country=c(unique(TPA.meta1.2.pinecone$Geo_Country)),stringsAsFactors = F), multi.country.locations.counts, by="Geo_Country", type="left")
multi.country.locations.counts[is.na(multi.country.locations.counts$per.country),"per.country"] <- 0
multi.country.locations.counts$lineage.type <- "Multi-country sublineage"

classified.sublineages.per.country <- rbind(multi.country.locations.counts,Private.country.location.counts,Singleton.country.counts)
classified.sublineages.per.country$Geo_Country <- factor(classified.sublineages.per.country$Geo_Country, levels=continental.country.cols.brew2$Geo_Country)
classified.sublineages.per.country$lineage.type <- factor(classified.sublineages.per.country$lineage.type, levels=rev(unique(classified.sublineages.per.country$lineage.type)))

plot.classified.sublineages.per.country <- ggplot(classified.sublineages.per.country, aes(Geo_Country, per.country, fill=lineage.type)) +
  geom_bar(position="stack", stat="identity", width=0.75) +
  theme_light() + 
  theme.text.size + 
  x.theme.axis.rotate +
  labs(x="Country",y="Sublineage/Country", fill="Sublineage Type") +
  scale_y_continuous(breaks=seq(0,18,2)) +
  theme(legend.key.size = unit(0.65,"line"),legend.position='right') + 
  ggtitle("Sublineage types by country") + 
  theme(plot.title = element_text(size = 10)) +
  scale_fill_manual(values=c("grey80","grey50","grey10"))
plot.classified.sublineages.per.country


plot.classified.sublineages.per.country.hbar <- ggplot(classified.sublineages.per.country, aes(per.country,Geo_Country, fill=lineage.type)) +
  geom_barh(position="stack", stat="identity", width=0.75) +
  theme_light() + 
  theme.text.size + 
  #x.theme.axis.rotate +
  labs(y="Country",x="Sublineage Count", fill="Sublineage Type") +
  scale_x_continuous(breaks=seq(0,18,2)) +
  theme(legend.key.size = unit(0.65,"line"),legend.position='right') + 
  #ggtitle("Sublineage types by country") + 
  theme(plot.title = element_text(size = 10)) +
  scale_fill_manual(values=c("grey80","grey50","grey10"))
plot.classified.sublineages.per.country.hbar



#plot_grid(p.sublineages.per.country.bar + theme(legend.position="none") + ggtitle("Number of sublineages in each country") + theme(plot.title = element_text(size=10)) + x.theme.strip, plot.classified.sublineages.per.country +theme(legend.position="bottom"), align=T, ncol=1, rel_heights=c(1,3))

Look at number of countries for each sublineage (v.s. sampling counts)

sublineage.country.distro.vs.total.counts <- plyr::join(data.frame(Private.country.counts,stringsAsFactors = F), sublineage.counts, by="TPA.pinecone.sublineage", type="left")

#library(ggrepel)
ggplot(sublineage.country.distro.vs.total.counts, aes(Count, Countries.count, size=Count, colour=TPA.pinecone.sublineage)) +
  geom_jitter(alpha=0.75) + 
  #geom_label_repel(data=sublineage.country.distro.vs.total.counts, aes(Count, Countries.count, label=TPA.pinecone.sublineage), segment.color = 'grey50', inherit.aes = F, size=2.5, box.padding=0.75) +
  #geom_point(alpha=0.75) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) + 
  theme_light() + 
  scale_size_binned(range = c(1, 10),breaks=c(5,20,50,100,200,400)) +
  scale_x_log10() +
  scale_y_continuous(breaks=seq(0,20,2)) +
  labs(y="Countries (Count)", x="Samples (Count)", size="Sample Count") +
  theme(legend.key.size = unit(0.65,"line"),legend.position='bottom') 

Summary of sublineage classifications


sublineage.classification <- Private.country.counts
sublineage.classification <- sublineage.classification[order(as.numeric(sublineage.classification$TPA.pinecone.sublineage)),]

# How many private sublineages?
nrow(sublineage.classification[sublineage.classification$private.distro=='private',])
[1] 8
# How many mult-country sublineages?
nrow(sublineage.classification[sublineage.classification$private.distro=='multicountry',])
[1] 9
# Where are the Singletons
Singleton.country.counts[Singleton.country.counts$per.country!=0,]

# Where are the private lineages?
Private.country.location.counts[Private.country.location.counts$per.country!=0,]
NA

Pairwise SNP analyses

# import multiple sequence alignment
TPA.WGS.alignment.data <- pairsnp::import_fasta_sparse(TPA.MSA.SNPs.aln.file)

# run pairsnp
TPA.WGS.alignment.data.dist <- pairsnp::snp_dist(TPA.WGS.alignment.data)
TPA.WGS.alignment.data.dist.melt <- reshape2::melt(TPA.WGS.alignment.data.dist)
colnames(TPA.WGS.alignment.data.dist.melt) <- c("Taxa1", "Taxa2", "Distance")

# Bring in and merge metadata
TPA.meta1.2.pairwise.t1 <- TPA.meta1.2.pinecone[,c("Cleaned_fastq_id","Sample_Name","Sample_Year","Geo_Country","Continent","TPA.pinecone.major","TPA.pinecone.sublineage", "TPA_Lineage")]
colnames(TPA.meta1.2.pairwise.t1) <- paste0(colnames(TPA.meta1.2.pairwise.t1),".t1")
colnames(TPA.meta1.2.pairwise.t1)[2] <- "Taxa1"
TPA.meta1.2.pairwise.t2 <- TPA.meta1.2.pinecone[,c("Cleaned_fastq_id","Sample_Name","Sample_Year","Geo_Country","Continent","TPA.pinecone.major","TPA.pinecone.sublineage", "TPA_Lineage")]
colnames(TPA.meta1.2.pairwise.t2) <- paste0(colnames(TPA.meta1.2.pairwise.t2),".t2")
colnames(TPA.meta1.2.pairwise.t2)[2] <- "Taxa2"

TPA.alignment.data.dist.melt.meta <- plyr::join(TPA.WGS.alignment.data.dist.melt,TPA.meta1.2.pairwise.t1, by="Taxa1", type="left") 
TPA.alignment.data.dist.melt.meta <- plyr::join(TPA.alignment.data.dist.melt.meta,TPA.meta1.2.pairwise.t2, by="Taxa2", type="left")

Define comparisons

# Same sample
TPA.alignment.data.dist.melt.meta$same.sample <- ifelse(TPA.alignment.data.dist.melt.meta$Taxa1==TPA.alignment.data.dist.melt.meta$Taxa2,"same", "different")

# Years between samples
TPA.alignment.data.dist.melt.meta$year.distance <- abs(as.numeric(TPA.alignment.data.dist.melt.meta$Sample_Year.t1) - as.numeric(TPA.alignment.data.dist.melt.meta$Sample_Year.t2))
NAs introduced by coercionNAs introduced by coercion
# Same country
TPA.alignment.data.dist.melt.meta$same.country <- ifelse(TPA.alignment.data.dist.melt.meta$Geo_Country.t1 == TPA.alignment.data.dist.melt.meta$Geo_Country.t2, "same", "different")

# Same continent
TPA.alignment.data.dist.melt.meta$same.continent <- ifelse(TPA.alignment.data.dist.melt.meta$Continent.t1 == TPA.alignment.data.dist.melt.meta$Continent.t2, "same", "different")

# Same TPA Major Lineage
TPA.alignment.data.dist.melt.meta$same.TPA.majorlineage <- ifelse(TPA.alignment.data.dist.melt.meta$TPA.pinecone.major.t1==TPA.alignment.data.dist.melt.meta$TPA.pinecone.major.t2, "same", "different")
TPA.alignment.data.dist.melt.meta$same.TPA.majorlineage <- sapply(1:nrow(TPA.alignment.data.dist.melt.meta), function(x) ifelse((TPA.alignment.data.dist.melt.meta$TPA.pinecone.major.t1[x]=="0" | TPA.alignment.data.dist.melt.meta$TPA.pinecone.major.t2[x]=="0"),NA,TPA.alignment.data.dist.melt.meta$same.TPA.majorlineage[x]))

# Same TPA Lineage (cleaned up classifications)
TPA.alignment.data.dist.melt.meta$same.TPA.Lineage <- ifelse(TPA.alignment.data.dist.melt.meta$TPA_Lineage.t1==TPA.alignment.data.dist.melt.meta$TPA_Lineage.t2, "same", "different")
TPA.alignment.data.dist.melt.meta$same.TPA.Lineage <- sapply(1:nrow(TPA.alignment.data.dist.melt.meta), function(x) ifelse((TPA.alignment.data.dist.melt.meta$TPA_Lineage.t1[x]=="0" | TPA.alignment.data.dist.melt.meta$TPA_Lineage.t2[x]=="0"),NA,TPA.alignment.data.dist.melt.meta$same.TPA.Lineage[x]))


# Same TPA sublineage
TPA.alignment.data.dist.melt.meta$same.TPA.Pinecone.cluster <- ifelse(TPA.alignment.data.dist.melt.meta$TPA.pinecone.sublineage.t1==TPA.alignment.data.dist.melt.meta$TPA.pinecone.sublineage.t2,"same", "different")
TPA.alignment.data.dist.melt.meta$same.TPA.Pinecone.cluster <- sapply(1:nrow(TPA.alignment.data.dist.melt.meta), function(x) ifelse(((TPA.alignment.data.dist.melt.meta$same.sample[x]=="different" & TPA.alignment.data.dist.melt.meta$TPA.pinecone.sublineage.t1[x]=="Singleton") |(TPA.alignment.data.dist.melt.meta$same.sample[x]=="different" & TPA.alignment.data.dist.melt.meta$TPA.pinecone.sublineage.t2[x]=="Singleton")),"different",TPA.alignment.data.dist.melt.meta$same.TPA.Pinecone.cluster[x]))

# Country Comparisons label
TPA.alignment.data.dist.melt.meta$Country_combinations <- paste0(TPA.alignment.data.dist.melt.meta$Geo_Country.t1,"___",TPA.alignment.data.dist.melt.meta$Geo_Country.t2)

Do some analysis of SNP distances within each country


TPA.alignment.data.dist.melt.meta$Geo_Country.t1 <- factor(TPA.alignment.data.dist.melt.meta$Geo_Country.t1, levels=continental.country.cols.brew2$Geo_Country)

p.pairwise.snps.withinCountry <- ggplot(TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.country=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different"),], aes(Geo_Country.t1, Distance, color=Geo_Country.t1)) +
  geom_sina(alpha=0.25, scale='width', method="d") +
  theme_light() +
  theme.text.size + 
  #theme(axis.text.x = element_text(angle = -45, vjust = 0.5, hjust=0.1)) +
  x.theme.axis.rotate +
  labs(x="Country",y="Pairwise SNPs") +
  scale_color_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme(legend.key.size = unit(0.65,"line"),legend.position='right') + 
  ggtitle("Pairwise SNP distributions between samples from same countries") + 
  theme(plot.title = element_text(size = 10)) +
  NULL
p.pairwise.snps.withinCountry

Split up by Major Lineage

#TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.country=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different" & TPA.alignment.data.dist.melt.meta$same.TPA_Lineage=="same"),]

# Max pairwise distance within country and within Lineages
max(TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.country=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different" & TPA.alignment.data.dist.melt.meta$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta$TPA_Lineage.t1=="SS14"),"Distance"])
[1] 26
max(TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.country=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different" & TPA.alignment.data.dist.melt.meta$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta$TPA_Lineage.t1=="Nichols"),"Distance"])
[1] 80
p.pairwise.snps.withinCountry.within.Lineage <- ggplot(TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.country=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different" & TPA.alignment.data.dist.melt.meta$same.TPA.majorlineage=="same"),], aes(Geo_Country.t1, Distance, color=Geo_Country.t1)) +
  geom_sina(alpha=0.5, scale='width', method="d") +
  theme_light() +
  theme.text.size + 
  x.theme.axis.rotate +
  labs(x="Country",y="Pairwise SNPs") +
  scale_color_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme(legend.key.size = unit(0.65,"line"),legend.position='right') + 
  ggtitle("Pairwise SNP distributions between samples from same countries by lineage") + 
  theme(plot.title = element_text(size = 10)) +
  facet_grid(TPA_Lineage.t1~.) +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.y = element_text(color = "grey25", size=10)) +
  NULL
p.pairwise.snps.withinCountry.within.Lineage

NA
NA

Do a combined plot (all, plus by major lineage)


TPA.alignment.data.dist.melt.meta.LineageCountryCombined <- TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.country=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different" & TPA.alignment.data.dist.melt.meta$same.TPA.majorlineage=="same"),c("Geo_Country.t1", "Distance","TPA_Lineage.t1")]
TPA.alignment.data.dist.melt.meta.LineageCountryCombined$TPA_Lineage <- TPA.alignment.data.dist.melt.meta.LineageCountryCombined$TPA_Lineage.t1

TPA.alignment.data.dist.melt.meta.LineageCountryCombined.2 <- TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.country=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different"),c("Geo_Country.t1", "Distance","TPA_Lineage.t1")]
TPA.alignment.data.dist.melt.meta.LineageCountryCombined.2$TPA_Lineage <- "All"


TPA.alignment.data.dist.melt.meta.LineageCountryCombined <- rbind(TPA.alignment.data.dist.melt.meta.LineageCountryCombined, TPA.alignment.data.dist.melt.meta.LineageCountryCombined.2)

p.pairwise.snps.withinCountry.within.all.Lineage <- ggplot(TPA.alignment.data.dist.melt.meta.LineageCountryCombined, aes(Geo_Country.t1, Distance, color=Geo_Country.t1)) +
  geom_sina(alpha=0.5, scale='width', method="d") +
  theme_light() +
  theme.text.size + 
  x.theme.axis.rotate +
  labs(x="Country",y="Pairwise SNPs") +
  scale_color_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme(legend.key.size = unit(0.65,"line"),legend.position='right') + 
  ggtitle("Pairwise SNP distributions among samples within the same country") +
  
  theme(plot.title = element_text(size = 10)) +
  facet_grid(TPA_Lineage~.) +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.y = element_text(color = "grey25", size=10)) +
  NULL
p.pairwise.snps.withinCountry.within.all.Lineage

Look at pairwise distances between countries (e.g. minimum pairwise distance)

TPA.alignment.data.dist.melt.meta.between.countries <- TPA.alignment.data.dist.melt.meta[TPA.alignment.data.dist.melt.meta$same.country=="different",]

TPA.alignment.data.dist.melt.meta.between.countries.mindist <- TPA.alignment.data.dist.melt.meta.between.countries %>% 
  dplyr::group_by(Country_combinations) %>%
  summarise(min.dist=min(Distance))
`summarise()` ungrouping output (override with `.groups` argument)
TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa1 <- gsub("\\_\\_\\_.+$","",TPA.alignment.data.dist.melt.meta.between.countries.mindist$Country_combinations, perl=T)
TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa2 <- gsub("^.+\\_\\_\\_","",TPA.alignment.data.dist.melt.meta.between.countries.mindist$Country_combinations, perl=T)

TPA.alignment.data.dist.melt.meta.between.countries.mindist$log10.mindist <- log10(TPA.alignment.data.dist.melt.meta.between.countries.mindist$min.dist)

TPA.alignment.data.dist.melt.meta.between.countries.mindist$log2.mindist <- log2(TPA.alignment.data.dist.melt.meta.between.countries.mindist$min.dist)

TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa1 <- factor(TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa1, levels=continental.country.cols.brew2$Geo_Country)
TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa2 <- factor(TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa2, levels=continental.country.cols.brew2$Geo_Country)


#WGS.alignment.data.dist.melt.meta.between.countries.mindist.matrix <- dcast(WGS.alignment.data.dist.melt.meta.between.countries.mindist[,c(3,4,2)], taxa1~taxa2)

taxa1.pwise.country.cols <- unique(TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa1)


p.country.minsnp.heatmap <- ggplot(TPA.alignment.data.dist.melt.meta.between.countries.mindist, aes(Taxa1, Taxa2, fill=min.dist)) +
  geom_tile(color="white") +
  scale_fill_gradient(low="yellow",high="red", trans="log2",name="Minimum Pairwise SNPs") + 
  #theme_classic() +
  theme_light() +
  theme.text.size + 
  scale_x_discrete(position = 'top') +
  #theme(axis.text.x=element_text(angle=90,hjust=0), axis.title.x = element_blank(),
  #      axis.title.y = element_blank()) +
  theme(axis.text.x=element_text(angle=90,vjust=1,hjust=0), axis.title.x = element_blank()) +
  geom_text(aes(label = min.dist), color = "black", size = 2.5) +
  #theme(axis.text.x = element_text(colour=data.frame(continental.country.cols.brew2)[continental.country.cols.brew2$Geo_Country!="Belgium","country.col"]), axis.text.y = element_text(colour=data.frame(continental.country.cols.brew2)[continental.country.cols.brew2$Geo_Country!="Belgium","country.col"])) +
  theme(axis.text.y = element_text(colour=data.frame(continental.country.cols.brew2)[continental.country.cols.brew2$Geo_Country!="Belgium","country.col"]))  +
  theme(legend.key.size = unit(0.65,"line"), legend.position='left') +
  #ggtitle("Minimum Pairwise SNPs between samples from different countries") +
  theme(plot.title = element_text(size = 10)) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + 
  labs(y="Country") +
  NULL
Vectorized input to `element_text()` is not officially supported.
Results may be unexpected or may change in future versions of ggplot2.
p.country.minsnp.heatmap

Samples per country (high quality genomes) - needed to give some context to the heatmap plot

TPA.pinecone.genome.counts <- TPA.meta1.2.pinecone %>% dplyr::group_by(Geo_Country) %>%
  dplyr::summarise(Total_samples=n())
`summarise()` ungrouping output (override with `.groups` argument)
TPA.pinecone.genome.counts$Geo_Country <- factor(TPA.pinecone.genome.counts$Geo_Country, levels=continental.country.cols.brew2$Geo_Country)

p.hq.country.hbarplot <- ggplot(TPA.pinecone.genome.counts, aes(Total_samples,Geo_Country,fill=Geo_Country)) +
  geom_barh(stat="identity", position="stack", width=0.75) +
  geom_text(data=TPA.pinecone.genome.counts, aes((Total_samples+25), Geo_Country, label=Total_samples), size=2.5, inherit.aes = F) +
  #theme_classic() + 
  theme_light() +
  scale_x_continuous(breaks=c(0,100,200,300)) +
  coord_cartesian(xlim=c(0,275)) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none') +
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  labs(y="Country", x="Sample Count") +
  #theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) 
  NULL
#p.hq.country.hbarplot


p.hq.country.barplot <- ggplot(TPA.pinecone.genome.counts, aes(Geo_Country, Total_samples,fill=Geo_Country)) +
  geom_bar(stat="identity", position="stack", width=0.75) +
  geom_text(data=TPA.pinecone.genome.counts, aes(Geo_Country, (Total_samples+25), label=Total_samples), size=2.5, inherit.aes = F) +
  theme_light() +
  scale_y_continuous(breaks=c(0,100,200,300)) +
  coord_cartesian(ylim=c(0,275)) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none') +
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  labs(x="Country", y="Sample Count") +
  x.theme.axis.rotate +
  NULL
#p.hq.country.hbarplot

plot together

p.country.minsnp.heatmap.counts <- plot_grid(p.country.minsnp.heatmap, p.hq.country.hbarplot + y.theme.strip, align=T, ncol=2, rel_widths=c(5,1))
Transformation introduced infinite values in discrete y-axisTransformation introduced infinite values in discrete y-axisGraphs cannot be vertically aligned unless the axis parameter is set. Placing graphs unaligned.
p.country.minsnp.heatmap.counts 

Plot sina of within-country SNPs alongside heatmap of min-SNPs between countries

multicountry.pairwise.snps.grid <- plot_grid(p.pairwise.snps.withinCountry,p.country.minsnp.heatmap.counts, ncol=2, rel_widths=c(2,3), labels=c('D','E'), label_size=11)
multicountry.pairwise.snps.grid

Get combination details for all zero pairings

# ensure all possible combos are included by 2-siding
TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations <- rbind(data.frame(min.dist=TPA.alignment.data.dist.melt.meta.between.countries.mindist$min.dist, taxa1=TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa1,taxa2=TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa2, stringsAsFactors = F),data.frame(min.dist=TPA.alignment.data.dist.melt.meta.between.countries.mindist$min.dist, taxa1=TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa2,taxa2=TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa1, stringsAsFactors = F)) 
TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations <- unique(TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations)

TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations.zeros <- TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations[TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations$min.dist==0,]

TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations.zeros.combos <- TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations.zeros %>% dplyr::group_by(taxa1, taxa2) %>% dplyr::summarise(count=n())
`summarise()` regrouping output by 'taxa1' (override with `.groups` argument)
nrow(TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations.zeros.combos)/2
[1] 27
TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations.zeros.combos
NA

Get specific pairwise stats for UK and Canada


TPA.alignment.data.dist.melt.meta.UK.Canada <- (TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$Country_combinations=="UK___Canada" | TPA.alignment.data.dist.melt.meta$Country_combinations=="Canada___UK"),c("Taxa1","Taxa2","Distance", "Country_combinations","year.distance")])

# Number of comparisons
nrow(TPA.alignment.data.dist.melt.meta.UK.Canada[TPA.alignment.data.dist.melt.meta.UK.Canada$Distance==0,])
[1] 2622
Canada.UK.zero.comparison.samples <- as.character(unique(TPA.alignment.data.dist.melt.meta.UK.Canada[TPA.alignment.data.dist.melt.meta.UK.Canada$Distance==0,"Taxa1"]))
Canada.UK.zero.comparison.samples <- unique(c(Canada.UK.zero.comparison.samples,as.character(unique(TPA.alignment.data.dist.melt.meta.UK.Canada[TPA.alignment.data.dist.melt.meta.UK.Canada$Distance==0,"Taxa2"]))))

Canada.UK.zero.comparison.samples <- data.frame(Sample_Name=Canada.UK.zero.comparison.samples,stringsAsFactors = F)
Canada.UK.zero.comparison.samples <- plyr::join(Canada.UK.zero.comparison.samples,TPA.meta1.2, by="Sample_Name", type="left")

nrow(Canada.UK.zero.comparison.samples)
[1] 134
nrow(Canada.UK.zero.comparison.samples[Canada.UK.zero.comparison.samples$Geo_Country=="UK",])
[1] 78
nrow(Canada.UK.zero.comparison.samples[Canada.UK.zero.comparison.samples$Geo_Country=="Canada",])
[1] 56
# Look at temporal distance within zero-SNP comparisons between countries
sort(unique(TPA.alignment.data.dist.melt.meta.UK.Canada[TPA.alignment.data.dist.melt.meta.UK.Canada$Distance==0,"year.distance"]))
 [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14
# Identify samples invovled in a 0 SNP comparison and are 14 years apart
unique(as.vector(as.matrix(TPA.alignment.data.dist.melt.meta.UK.Canada[(TPA.alignment.data.dist.melt.meta.UK.Canada$Distance==0 & TPA.alignment.data.dist.melt.meta.UK.Canada$year.distance==14),c("Taxa1","Taxa2")])))
[1] "TPA_UKBIR049" "TPA_UKBIR030" "TPA_BCC030"   "TPA_BCC032"   "TPA_UKMAN027" "TPA_UKLEE004"
TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Sample_Name %in% c("TPA_UKBIR049", "TPA_UKBIR030", "TPA_BCC030", "TPA_BCC032", "TPA_UKMAN027", "TPA_UKLEE004"),]

TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$Taxa1 %in% c("TPA_BCC030","TPA_BCC032") & TPA.alignment.data.dist.melt.meta$Taxa2 %in% c("TPA_BCC030","TPA_BCC032")),]
#"TPA_BCC030", "TPA_BCC032" 


min(Canada.UK.zero.comparison.samples[,"Sample_Year"])
[1] "2004"
max(Canada.UK.zero.comparison.samples[,"Sample_Year"])
[1] "2019"
TPA.alignment.data.dist.melt.meta

Make a plot looking at pairwise distances in UK and Canada

TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta <- (TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$Country_combinations=="Canada___UK" | TPA.alignment.data.dist.melt.meta$Country_combinations=="Canada___Canada" |  TPA.alignment.data.dist.melt.meta$Country_combinations=="UK___UK"),])

TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta <- TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta$same.sample=="different",]

TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta$Country_combinations2 <- gsub("UK___UK","England",gsub("Canada___Canada","British Columbia",gsub("Canada___UK","British Columbia v.s. England",TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta$Country_combinations)))


# do distributions plots on PW SNPs for country comparisons
p.UK.canada.pwSNPs.sina <- ggplot(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta, aes(Country_combinations2, Distance)) +
  geom_sina(alpha=0.1,size=1, aes(color=Country_combinations2)) + 
  #geom_boxplot(alpha=0.01, outlier.shape = NA, width=0.25) +
  theme_light() +
  theme.text.size + 
  #scale_y_log10() +
  labs(x="Country Combination",y="Pairwise SNPs") +
  #scale_colour_manual(values=c("#74C476", "cyan3", "#084594")) +
  scale_colour_manual(values=c("#74C476", "grey50", "#084594")) + 
  theme(legend.position='none') +
  ggtitle("Pairwise SNPs within and between British Columbia (Canada) and England (UK)") +
  NULL
p.UK.canada.pwSNPs.sina


# Do distributions of pwSNPs v.s. timepoints
p.UK.canada.pwSNPs.vs.Time.points <- ggplot(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta, aes(year.distance,Distance)) +
  geom_point(size=1, alpha=0.1) +
  geom_density_2d() +
  theme_light() +
  facet_grid(.~Country_combinations2) + 
  scale_y_log10() +
  labs(y="Pairwise SNP distance (log10 scale)", x="Pairwise time distance (years)") +
  theme.text.size +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.x = element_text(color = "grey25", size=10))
#p.UK.canada.pwSNPs.vs.Time.points 

# Breakdown pairwise SNP/time distances by sublineage
TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta$TPA.pinecone.sublineage.t1 <- factor(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta$TPA.pinecone.sublineage.t1, levels=sublineages.cols.brew$sublineage)
p.UK.canada.pwSNPs.vs.Time.points.sublineage.breakdown <- ggplot(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta$same.TPA.Pinecone.cluster=="same",], aes(year.distance,Distance, color=TPA.pinecone.sublineage.t1)) +
  geom_point(size=2, alpha=0.25) +
  geom_density_2d(color="black", alpha=0.5) +
  theme_light() +
  facet_grid(TPA.pinecone.sublineage.t1~Country_combinations2) + 
  #scale_y_log10() +
  labs(y="Pairwise SNP distance (log10 scale)", x="Pairwise time distance (years)") +
  theme.text.size +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.x = element_text(color = "grey25", size=10),strip.text.y = element_text(color = "grey25", size=10)) +
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) + 
  NULL
p.UK.canada.pwSNPs.vs.Time.points.sublineage.breakdown







# do pwSNPs v.s. timepoints using a hexplot to reduce overplotting
p.UK.canada.pwSNPs.vs.Time.hex <- ggplot(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta, aes(year.distance,Distance)) +
  stat_bin_hex(colour="white", na.rm=TRUE, bins = 20) +
  scale_fill_gradientn(colours=c("purple","green"), 
                       name = "Comparison Frequency", breaks=c(1,30,1000),
                       na.value=NA, trans="log10") + 
  facet_grid(.~Country_combinations2) + 
  theme_light() +
  labs(y="Pairwise SNP distance", x="Pairwise time distance (years)") +
  theme.text.size + theme(legend.key.size = unit(0.75,"line")) +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.x = element_text(color = "grey25", size=10)) +
  theme(legend.position="bottom") +
  ggtitle("Pairwise SNPs and Years within and between British Columbia (Canada) and England (UK)") + theme(plot.title = element_text(size = 10))
#p.UK.canada.pwSNPs.vs.Time.hex




# do pwSNPs v.s. timepoints using a hexplot to reduce overplotting (by sublineage)
TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage <- TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta$same.TPA.Pinecone.cluster=="same",]

p.UK.canada.pwSNPs.vs.Time.hex.sublineage.breakdown <- ggplot(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage, aes(year.distance,Distance)) +
  stat_bin_hex(colour="white", na.rm=TRUE, bins = 15) +
  #geom_density_2d_filled() +
  #geom_density_2d_filled() + scale_fill_brewer(palette="PuRd") +
  scale_fill_gradientn(colours=c("purple","green"), 
                       name = "Comparison Frequency", breaks=c(1,30,1000),
                       na.value=NA, trans="log10") + 
  #geom_point(alpha=0.025, size=0.25) +
  #geom_density_2d(color="black", alpha=0.5, bins=4) +
  #facet_grid(TPA.pinecone.sublineage.t1~Country_combinations2) + 
  facet_grid(.~Country_combinations2) + 
  theme_light() +
  #scale_y_log10(breaks=c(0.01,1,5,10,20,40)) + scale_x_log10(breaks=c(0.01,1,5,10,20)) +
  #coord_cartesian(xlim=c(0.1,20), ylim=c(0.1,30)) +
  scale_y_continuous(breaks=seq(0,25,5)) +
  labs(y="Pairwise SNP distance", x="Pairwise time distance (years)") +
  theme.text.size + theme(legend.key.size = unit(0.75,"line")) +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.x=element_text(color="grey25", size=10), strip.text.y=element_text(color="grey25", size=10)) +
  theme(legend.position="bottom") +
  ggtitle("Pairwise SNPs (same sublineage) and Years within and between British Columbia (Canada) and England (UK)") + theme(plot.title = element_text(size = 10))


p.UK.canada.pwSNPs.vs.Time.hex.sublineage.breakdown <- p.UK.canada.pwSNPs.vs.Time.hex.sublineage.breakdown + stat_smooth(method='lm', fullrange=F,se=T, color='black', level=95)
#p.UK.canada.pwSNPs.vs.Time.hex.sublineage.breakdown + stat_smooth(fullrange=F,se=T, color='black', formula=log10(x) ~ log10(x))
#p.UK.canada.pwSNPs.vs.Time.hex.sublineage.breakdown

Look at temporal relationships a little more

# time distances at 0 SNPs
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance==0,"year.distance"])
[1] 2.452642
max(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance==0,"year.distance"])
[1] 15
min(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance==0,"year.distance"])
[1] 0
# mean SNP distance
max(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance)
[1] 26
# max SNP distance 
max(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance)
[1] 26
# time distances at 26 SNPs
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance==26,"year.distance"])
[1] 1
max(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance==26,"year.distance"])
[1] 1
min(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance==26,"year.distance"])
[1] 1
# time distances at 0 SNPs (Canada)
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance==0 & TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Country_combinations=="Canada___Canada"),"year.distance"])
[1] 2.8944
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance==0 & TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Country_combinations=="UK___UK"),"year.distance"])
[1] 1.915912
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance==0 & TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Country_combinations=="Canada___UK"),"year.distance"])
[1] 2.664378
##############
# SNP distances at 0 time
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance==0,"Distance"])
[1] 4.858593
max(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance==0,"Distance"])
[1] 23
min(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance==0,"Distance"])
[1] 0
max(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance)
[1] 19
# SNP distances at 19 years time
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance==19,"Distance"])
[1] 7.8
max(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance==19,"Distance"])
[1] 11
min(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance==19,"Distance"])
[1] 6
# SNP distances at 0 time (England)
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance==0 & TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Country_combinations=="UK___UK"),"Distance"])
[1] 5.303675
# SNP distances at 0 time (Canada)
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance==0 & TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Country_combinations=="Canada___Canada"),"Distance"])
[1] 2.599581
# SNP distances at 0 time (Both)
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance==0 & TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Country_combinations=="Canada___UK"),"Distance"])
[1] 4.504094

Look at formally testing for signal

Calculate Pearson’s correlation (for real dataset)

nrow(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage)
[1] 55841

Combine pwSNP data with rest of Canada/UK plots


plot.UK.BC.stats.vs.bubbleplot <- plot_grid(plot.UK.BC.stats.combined + x.theme.strip + ggtitle("Syphilis Incidence and sublineage count") +theme(plot.title = element_text(size = 10)), plot.TPA.sublineage_UK.Canada.temporal.counts.bubbleplot + theme(legend.position="bottom",strip.background = element_blank(),strip.text.x = element_blank()), ncol=1, align=T, rel_heights=c(1,2), labels=c('A','B'), label_size=11)

#plot.UK.BC.stats.vs.bubbleplot_vs_pairwise.SNPs <- plot_grid(plot.UK.BC.stats.vs.bubbleplot, p.UK.canada.pwSNPs.vs.Time.hex, ncol=2, rel_widths=c(5,6), labels=c('','C'), label_size=11)

plot.UK.BC.stats.vs.bubbleplot_vs_pairwise.SNPs <- plot_grid(plot.UK.BC.stats.vs.bubbleplot, p.UK.canada.pwSNPs.sina, ncol=2, rel_widths=c(7,5), labels=c('','C'), label_size=11)



plot.UK.BC.stats.vs.bubbleplot_vs_pairwise.SNPs

Combine to make a BC vs UK only plot


#plot.UK.BC.stats.vs.bubbleplot
#p.UK.canada.pwSNPs.vs.Time.hex.sublineage.breakdown


#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure13_Canada-vs-UK_distros_03-2021.svg"), width = 600, height = 1050,type="svg",units = "pt")
plot_grid(plot.UK.BC.stats.vs.bubbleplot, p.UK.canada.pwSNPs.sina, p.UK.canada.pwSNPs.vs.Time.hex.sublineage.breakdown, ncol=1, labels=c('','C','D'), label_size=11, scale=0.95, rel_heights=c(4,2,3))

#dev.off()

Combine Canada/UK analysis with Global Pairwise analysis



plot_grid(plot.UK.BC.stats.vs.bubbleplot_vs_pairwise.SNPs, multicountry.pairwise.snps.grid, ncol=1, scale=0.95)

Do pairwise SNPs within sublineages


#TPA.alignment.data.dist.melt.meta$Geo_Country.t1 <- factor(TPA.alignment.data.dist.melt.meta$Geo_Country.t1, levels=continental.country.cols.brew2$Geo_Country)

TPA.alignment.data.dist.melt.meta$TPA.pinecone.sublineage.t1 <- factor(TPA.alignment.data.dist.melt.meta$TPA.pinecone.sublineage.t1, levels=sublineages.cols.brew$sublineage)

#scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage)

p.pairwise.snps.within.Sublineage <- ggplot(TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.TPA.Pinecone.cluster=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different"),], aes(TPA.pinecone.sublineage.t1, Distance, color=TPA.pinecone.sublineage.t1)) +
  geom_violin(scale='width') +
  geom_sina(alpha=0.5, scale='width', method="d") + 
  theme_light() +
  theme.text.size + 
  #x.theme.axis.rotate +
  labs(x="Sublineage",y="Pairwise SNPs") +
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme(legend.key.size = unit(0.65,"line"),legend.position='none') + 
  ggtitle("Pairwise SNP distributions between samples from same sublineage") + 
  theme(plot.title = element_text(size = 10)) + 
  NULL


p.pairwise.snps.within.Sublineage

Just check the maximum pairwise SNPs within sublineages

max(TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.TPA.Pinecone.cluster=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different"),"Distance"])
[1] 26
max(TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.TPA.Pinecone.cluster=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different" & TPA.alignment.data.dist.melt.meta$TPA.pinecone.sublineage.t1==19),"Distance"])
no non-missing arguments to max; returning -Inf
[1] -Inf

networks of lineage sharing

Create a network of country links (simply based on cooccurrence of sublineages) - make a heatmap based on the linklist


country.combinations <- data.frame(t(combn(data.frame(continental.country.cols.brew2)[continental.country.cols.brew2$Geo_Country!="Belgium","Geo_Country"],2, simplify=T)),stringsAsFactors = F)
colnames(country.combinations) <- c("taxa1","taxa2")
country.combinations <- rbind(country.combinations,data.frame(taxa1=country.combinations$taxa2, taxa2=country.combinations$taxa1, stringsAsFactors = F))
country.combinations$combo <- paste0(country.combinations$taxa1,"___",country.combinations$taxa2)


sublineage.country.summary.simple <- data.frame(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage!="Singleton",] %>% dplyr::group_by(TPA.pinecone.sublineage,Geo_Country) %>% 
  dplyr::summarise(total.samples=n()), stringsAsFactors = F)
`summarise()` regrouping output by 'TPA.pinecone.sublineage' (override with `.groups` argument)
sublineage.country.summary.simple <- plyr::join(sublineage.country.summary.simple,unique(TPA.meta1.2.pinecone[,c("TPA.pinecone.sublineage","TPA.pinecone.major")]), by="TPA.pinecone.sublineage")


#sublineage.sharing.links.all <- sublineage.sharing.list
sublineage.sharing.links.all <- data.frame(sublineage.country.summary.simple %>% dplyr::group_by(TPA.pinecone.sublineage) %>% dplyr::summarise(no_links = length(TPA.pinecone.sublineage)),stringsAsFactors = F)
`summarise()` ungrouping output (override with `.groups` argument)
sublineage.sharing.links <- subset(sublineage.sharing.links.all, no_links>1)


# Make list of interaction combos (i.e. network edges)
linklist.all <- NULL
for (current in sublineage.sharing.links.all$TPA.pinecone.sublineage){
  current.sublineage <- subset(sublineage.country.summary.simple,TPA.pinecone.sublineage==current)
  if (nrow(current.sublineage)>1){
    current.sublineage1 <- data.frame(taxa1=t(combn(current.sublineage$Geo_Country, 2, FUN=NULL, simplify=T))[,1],taxa2=t(combn(current.sublineage$Geo_Country, 2, FUN=NULL, simplify=T))[,2],stringsAsFactors=T)
  current.sublineage1$sublineage <- current
  linklist.all <- rbind(linklist.all, current.sublineage1)
  }
}

linklist.all <- data.frame(linklist.all, stringsAsFactors=F)
linklist.all$combo <- paste0(linklist.all$taxa1,"___",linklist.all$taxa2)
linklist.all$combo2 <- paste0(linklist.all$taxa2,"___",linklist.all$taxa1)


linklist.all <- rbind(data.frame(taxa1=linklist.all$taxa1,taxa2=linklist.all$taxa2, sublineage=linklist.all$sublineage,combo=linklist.all$combo, stringsAsFactors =F), data.frame(taxa1=linklist.all$taxa2,taxa2=linklist.all$taxa1, sublineage=linklist.all$sublineage,combo=linklist.all$combo2, stringsAsFactors =F))


linklist.all.frequency <- linklist.all %>% dplyr::group_by(combo, .drop=F) %>%
  dplyr::summarise(Sublineage.Count=n())
`summarise()` ungrouping output (override with `.groups` argument)
linklist.all <- plyr::join(linklist.all,linklist.all.frequency, by="combo", type='left')

linklist.all2 <- plyr::join(country.combinations[,c("taxa1","taxa2","combo")],linklist.all[,c("taxa1","taxa2","combo","Sublineage.Count")], type="left")
Joining by: taxa1, taxa2, combo
linklist.all2 <- linklist.all2[rev(order(linklist.all2$combo, linklist.all2$Sublineage.Count)),]
linklist.all2 <-linklist.all2[!duplicated(linklist.all2),]
linklist.all <- linklist.all2


linklist.all[is.na(linklist.all$Sublineage.Count),"Sublineage.Count"] <- 0


linklist.all$taxa1 <- factor(linklist.all$taxa1, levels=continental.country.cols.brew2$Geo_Country)
linklist.all$taxa2 <- factor(linklist.all$taxa2, levels=continental.country.cols.brew2$Geo_Country)

p.country.sublineage.lings.heatmap <- ggplot(linklist.all, aes(taxa1, taxa2, fill=Sublineage.Count)) +
  geom_tile(color="white") +
  scale_fill_gradient(low="#ffffcc",high="#2c7fb8", name="Shared\nsublineages") + 
  #theme_classic() + 
  theme_light() + 
  theme.text.size + 
  theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5), axis.title.x = element_blank(),
        axis.title.y = element_blank()) +
  geom_text(aes(label = Sublineage.Count), color = "black", size = 2.5) +
  theme(axis.text.x = element_text(colour=data.frame(continental.country.cols.brew2)[continental.country.cols.brew2$Geo_Country!="Belgium","country.col"]), axis.text.y = element_text(colour=data.frame(continental.country.cols.brew2)[continental.country.cols.brew2$Geo_Country!="Belgium","country.col"])) +
  theme(legend.key.size = unit(0.75,"line")) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  ggtitle("Number of sublineages shared between countries") + theme(plot.title = element_text(size = 10))
Vectorized input to `element_text()` is not officially supported.
Results may be unexpected or may change in future versions of ggplot2.Vectorized input to `element_text()` is not officially supported.
Results may be unexpected or may change in future versions of ggplot2.
  
p.country.sublineage.lings.heatmap

Plot lineage sharing network with classified lineage counts


p.shared.sublineage.legend <- plot_grid(get_legend(plot.classified.sublineages.per.country + theme(legend.position='right')), get_legend(p.country.sublineage.lings.heatmap + theme(legend.position='right')), ncol=1, rel_heights=c(2,3))

p.shared.sublineage.plotgrid <-  plot_grid(plot.classified.sublineages.per.country + theme(legend.position="none") + ggtitle("Number of sublineages in each country") + theme(plot.title = element_text(size=10)) + x.theme.strip, p.country.sublineage.lings.heatmap + theme(legend.position="none"), align=T, ncol=1, rel_heights=c(2,3),labels=c('A','B'), label_size=11)

plot_grid(p.shared.sublineage.plotgrid,p.shared.sublineage.legend, ncol=2, rel_widths=c(4,1))

NA
NA

Try plot a different way (02-2021) to focus on global analysis alone


# Sort labels
plot.pairwise.SNPs.combi.5 <- plot_grid(plot.classified.sublineages.per.country + x.theme.strip + theme(legend.position="none") + labs(y="Sublineage Count"), p.hq.country.barplot + x.theme.strip + ggtitle('Sample count by country') + theme(plot.title=element_text(size=10)), 
                                        p.country.minsnp.heatmap + theme(legend.position="none") + ggtitle('Minimum pairwise SNP distance between countries') + theme(plot.title=element_text(size=10)), ncol=1, rel_heights=c(1,1,4), align="v", axis="lr", labels=c('A','B','D'), label_size=11)

plot.pairwise.SNPs.combi.5 <- plot_grid(plot.pairwise.SNPs.combi.5, p.pairwise.snps.withinCountry.within.all.Lineage + theme(legend.position='none'), ncol=2, labels=c('','C'), label_size=11, scale=0.95)



# Sort labels
plot.pairwise.SNPs.combi.6 <- plot_grid(plot.classified.sublineages.per.country + x.theme.strip + theme(legend.position="none") + labs(y="Sublineage Count"), p.hq.country.barplot + ggtitle('Sample count by country') + theme(plot.title=element_text(size=10)), 
                                        p.country.minsnp.heatmap + theme(legend.position="none") + ggtitle('Minimum pairwise SNP distance between countries') + theme(plot.title=element_text(size=10)), ncol=1, rel_heights=c(1,2,4), align="v", axis="lr", labels=c('A','B','D'), label_size=11)

plot.pairwise.SNPs.combi.6 <- plot_grid(plot.pairwise.SNPs.combi.6, p.pairwise.snps.withinCountry.within.all.Lineage + theme(legend.position='none'), ncol=2, labels=c('','C'), label_size=11, scale=0.95)




pwSNPs.legend <- get_legend(p.pairwise.snps.withinCountry.within.all.Lineage + theme(legend.position='top'))
pwSNPs.heatmap.legend <- get_legend(p.country.minsnp.heatmap + theme(legend.position='top'))
sublineages.legend <- get_legend(plot.classified.sublineages.per.country + theme(legend.position='top') + guides(fill=guide_legend(nrow=3)))
countries.bar.legend <- get_legend(p.hq.country.barplot + theme(legend.position='top') + guides(fill=guide_legend(nrow=6)))

Figure3.combi.legend <- plot_grid(sublineages.legend,pwSNPs.heatmap.legend,countries.bar.legend, nrow=1, rel_widths=c(2,1,3))

pwSNPs.legend.vert <- get_legend(p.pairwise.snps.withinCountry.within.all.Lineage + theme(legend.position='left'))
pwSNPs.heatmap.legend.vert <- get_legend(p.country.minsnp.heatmap + theme(legend.position='left'))
sublineages.legend.vert <- get_legend(plot.classified.sublineages.per.country + theme(legend.position='left') + guides(fill=guide_legend(ncol=1)))
countries.bar.legend.vert <- get_legend(p.hq.country.barplot + theme(legend.position='left') + guides(fill=guide_legend(ncol=1)))

Figure3.combi.legend.vert <- plot_grid(sublineages.legend.vert,pwSNPs.heatmap.legend.vert,countries.bar.legend.vert, ncol=1, rel_heights=c(2,1,3))



####
# Version 7 of this complex and much commented on plot ;-) 

Figure3.combi.legend.mixit1 <- plot_grid(sublineages.legend,countries.bar.legend, nrow=2, rel_heights=c(2,3))

# Sort Figure and labels
plot.pairwise.SNPs.combi.7 <- plot_grid(plot.classified.sublineages.per.country + x.theme.strip + theme(legend.position="none") + labs(y="Sublineage Count"), p.hq.country.barplot + x.theme.strip + ggtitle('Sample count by country') + theme(plot.title=element_text(size=10)), 
                                        p.country.minsnp.heatmap + theme(legend.position="none") + ggtitle('Minimum pairwise SNP distance between countries') + theme(plot.title=element_text(size=10)), ncol=1, rel_heights=c(1,1,4), align="v", axis="lr", labels=c('A','B','D'), label_size=11)
plot.pairwise.SNPs.combi.7.a <- plot_grid(plot.pairwise.SNPs.combi.7, pwSNPs.heatmap.legend, rel_heights=c(11,1),ncol=1)

plot.pairwise.SNPs.combi.7.b <- plot_grid(Figure3.combi.legend.mixit1,p.pairwise.snps.withinCountry.within.all.Lineage + theme(legend.position='none'),ncol=1, rel_heights=c(1,4), labels=c('','C'), label_size=11, scale=0.95)  

plot.pairwise.SNPs.combi.7.c <- plot_grid(plot.pairwise.SNPs.combi.7.a, plot.pairwise.SNPs.combi.7.b, ncol=2)






#Cairo::Cairo(file=paste0(Figure_output_directory, "Figure4_Sublin+PairwiseSNPs__Global-distro_02-2021.svg"), width = 800, height = 800,type="svg",units = "pt")
plot.pairwise.SNPs.combi.7.c

#dev.off()


# need to do a tree that only highlights singleton or private sublineages


private.singleton.lineages <- data.frame(Private.country.counts[Private.country.counts$private.distro=="private",c("TPA.pinecone.sublineage","private.distro")])

private.singleton.lineages <- rbind(private.singleton.lineages,data.frame(TPA.pinecone.sublineage="Singleton",private.distro="Singleton",stringsAsFactors = F))
private.singleton.lineages$private.sublineages <- private.singleton.lineages$TPA.pinecone.sublineage
private.singleton.lineages <- private.singleton.lineages[,c("TPA.pinecone.sublineage","private.sublineages")]

private.singleton.samples <-  plyr::join(TPA.meta1.2.pinecone[,c("Sample_Name","TPA.pinecone.sublineage")],private.singleton.lineages, type="left", by="TPA.pinecone.sublineage")

private.singleton.samples <- data.frame(row.names=private.singleton.samples$Sample_Name, "Private or Singleton\nSublineage"=private.singleton.samples$private.sublineages)


# add private lineage strip to tree 
p.TPA.MLtree.sublineages.privatelineage <- gheatmap(TPA.MLtree.ggtree.tippoint + theme(legend.position="none"),
               private.singleton.samples, color='grey70',width=0.075,offset=0.00000725, colnames_angle=-45,colnames_offset_y=0, hjust=0,font.size=2) + 
  scale_fill_manual(name="Private & Singleton\nSublineages",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right')
p.TPA.MLtree.sublineages.privatelineage <- p.TPA.MLtree.sublineages.privatelineage + new_scale_fill()


p.TPA.MLtree.sublineages.privatelineage <-gheatmap(p.TPA.MLtree.sublineages.privatelineage,TPA.rawseq.countries.p, color='grey70',width=0.075,offset=0.00001725, colnames_angle=-45,colnames_offset_y=0, hjust=0,font.size=2) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right') +
  NULL



#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure11__MLtree_private+singleton_02-2021.svg"), width = 800, height = 800,type="svg",units = "pt")
p.TPA.MLtree.sublineages.privatelineage

#dev.off()

Need to make a subtree highlighting the Reference strains from Nichols lineage

#Nichols.coll + geom_text2(aes(subset=!isTip, label=node), hjust=-.3, size=2.5) 
#ggtree(Nichols.ref.subtree.nodeid.tree) + geom_text2(aes(subset=!isTip, label=node), hjust=-.3) 

#Nichols.reference.clade.Year.data <- data.frame(row.names=TPA.meta1.2.pinecone.havedates$Sample_Name, TPA.meta1.2.pinecone.havedates$Sample_Year,stringsAsFactors=F)


Nichols.ref.subtree.nodeid <- 976
#Nichols.ref.subtree.nodeid <- 979

Nichols.ref.subtree.nodeid.tree <- tree_subset(TPA.MLtree, node=Nichols.ref.subtree.nodeid,levels_back=0)
#ggtree(Nichols.ref.subtree.nodeid.tree) + geom_tiplab(size=2.5) 



p.Nichols.ref.subtree.nodeid.tree <- ggtree(Nichols.ref.subtree.nodeid.tree) %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage) + 
  geom_tippoint(aes(color=Sublineage), size=2.5, alpha=0.25,show.legend=F) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  # add bootstrap support
  geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=2, shape=18) +
  geom_tiplab(size=2.5,align=F) + 
  geom_treescale(fontsize = 2.5, x=0.00001, y=12) +
  #xlim(0, 0.00007) +
  NULL

p.Nichols.ref.subtree.nodeid.tree.hm <- gheatmap(p.Nichols.ref.subtree.nodeid.tree,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage), color=NULL,width=0.065,offset=0.0000095, colnames_angle=0,colnames_offset_y=-0.01, font.size=2) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.75,"line"),legend.position='right') +
  geom_rootedge(0.00000075) 
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
p.Nichols.ref.subtree.nodeid.tree.hm <- p.Nichols.ref.subtree.nodeid.tree.hm + new_scale_fill()


p.Nichols.ref.subtree.nodeid.tree.hm <- gheatmap(p.Nichols.ref.subtree.nodeid.tree.hm, data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Year=TPA.meta1.2.pinecone$Sample_5year.window, stringsAsFactors = F), color=NULL,width=0.065,offset=0.0000135, colnames_angle=0,colnames_offset_y=-0.01, font.size=2) + scale_fill_manual(name="Year",values=TPA.5year.window.brewcols$window.5year.cols[1:(length(TPA.5year.window.brewcols$window.5year.cols)-1)], breaks=TPA.5year.window.brewcols$window.5year[1:(length(TPA.5year.window.brewcols$window.5year)-1)]) #+
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
  #theme.text.size + theme(legend.key.size = unit(0.75,"line"),legend.position='left')


p.Nichols.ref.subtree.nodeid.tree.hm <- p.Nichols.ref.subtree.nodeid.tree.hm + new_scale_fill()
p.Nichols.ref.subtree.nodeid.tree.hm

NA
NA

Plot main Nichols tree, but with highlight for relevant clade

p.TPA.Nichols.coll.highlight <- p.TPA.Nichols.coll + new_scale_fill()
p.TPA.Nichols.coll.highlight <- p.TPA.Nichols.coll.highlight  + geom_hilight(node=Nichols.ref.subtree.nodeid, alpha=0.2, fill="grey45")
#p.TPA.Nichols.coll.highlight

# Add sample year (group)
p.TPA.Nichols.coll.highlight <- gheatmap(p.TPA.Nichols.coll.highlight,data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Year=TPA.meta1.2.pinecone$Sample_5year.window, stringsAsFactors = F), color=NULL,width=0.085,offset=0.00002025, colnames_angle=0,colnames_offset_y=-1, font.size=2) + scale_fill_manual(name="Year",values=TPA.5year.window.brewcols$window.5year.cols[1:(length(TPA.5year.window.brewcols$window.5year.cols)-1)], breaks=TPA.5year.window.brewcols$window.5year[1:(length(TPA.5year.window.brewcols$window.5year)-1)]) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='left')
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
p.TPA.Nichols.coll.highlight <- p.TPA.Nichols.coll.highlight + new_scale_fill()

plot Nichols trees together


p.Nichols.ref.subtree.nodeid.tree.hm.grid <- plot_grid(NULL,p.Nichols.ref.subtree.nodeid.tree.hm + theme(legend.position="none") + ggtitle("Subtree") + theme(plot.title = element_text(size = 10)), NULL, rel_heights=c(1,4,1),ncol=1, labels=c('','B',''), label_size=11, vjust=0)

p.Nichols.ref.subtree.nodeid.tree.hm.grid.final <- plot_grid(p.TPA.Nichols.coll.highlight + theme(legend.position="left") + ggtitle("Nichols-lineage phylogeny") + theme(plot.title = element_text(size = 10)),NULL,p.Nichols.ref.subtree.nodeid.tree.hm.grid, ncol=3, rel_widths=c(10,1,10), labels=c('A','',''), vjust=1, label_size=11)


#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure6_MLtree_Nichols-reference-highlight__02-2021.svg"), width = 800, height = 500,type="svg",units = "pt")
p.Nichols.ref.subtree.nodeid.tree.hm.grid.final

#dev.off()

subset Nichols to show outgroups better


# sublineage 19 
sublineages.tocollapse.nodeid.19 <- 997
# Sublineage 14 (formerly called 19 in older analysis before bootstrapping)
sublineages.tocollapse.nodeid.14 <- 997

# sublineage 12 
sublineages.tocollapse.nodeid.12 <- 963



# Collapse SS14 clade and largest Nichols sublineage to make for easier viewing
Nichols.coll.2clades <- ggtree(TPA.MLtree) %>% collapse(node=SS14.subtree.nodeid) %>% 
  collapse(node=sublineages.tocollapse.nodeid.19) #%>%
  #collapse(node=sublineages.tocollapse.nodeid.12) 

# Add some extra to y axis for spacing
Nichols.coll.2clades$data[Nichols.coll.2clades$data$node==SS14.subtree.nodeid,"y"] <- Nichols.coll.2clades$data[Nichols.coll.2clades$data$node==SS14.subtree.nodeid,"y"] + 8

Nichols.coll.2clades$data[Nichols.coll.2clades$data$node==sublineages.tocollapse.nodeid.14,"y"] <- Nichols.coll.2clades$data[Nichols.coll.2clades$data$node==sublineages.tocollapse.nodeid.14,"y"] + 3

#Nichols.coll.2clades$data[Nichols.coll.2clades$data$node==sublineages.tocollapse.nodeid.12,"y"] <- Nichols.coll.2clades$data[Nichols.coll.2clades$data$node==sublineages.tocollapse.nodeid.12,"y"] + 5
#sublineages.tocollapse.nodeid.12


# Add first triangle
Nichols.coll.2clades <- Nichols.coll.2clades + geom_text2(aes(subset=(node == SS14.subtree.nodeid)), size=20, label=intToUtf8(9664), hjust=0.2,vjust=.55, family="OpenSansEmoji", color="indianred1", alpha=.75)
Nichols.coll.2clades <- Nichols.coll.2clades  + geom_text2(aes(subset=(node == SS14.subtree.nodeid)), cex=2.5, vjust=0.2, label="SS14",hjust = -1.5)


# Add second triangle
#Nichols.coll.2clades <- Nichols.coll.2clades + geom_text2(aes(subset=(node == sublineages.tocollapse.nodeid.19)), size=20, label=intToUtf8(9664), hjust=0.2,vjust=.55, family="OpenSansEmoji", color=sublineages.cols.brew[sublineages.cols.brew$sublineage==19,"sublineage.cols"], alpha=.75)
#Nichols.coll.2clades <- Nichols.coll.2clades  + geom_text2(aes(subset=(node == sublineages.tocollapse.nodeid.19)), cex=2.5, vjust=0.2, label="Sublineage 19",hjust = -0.5)

Nichols.coll.2clades <- Nichols.coll.2clades + geom_text2(aes(subset=(node == sublineages.tocollapse.nodeid.14)), size=20, label=intToUtf8(9664), hjust=0.2,vjust=.55, family="OpenSansEmoji", color=sublineages.cols.brew[sublineages.cols.brew$sublineage==14,"sublineage.cols"], alpha=.75)
Nichols.coll.2clades <- Nichols.coll.2clades  + geom_text2(aes(subset=(node == sublineages.tocollapse.nodeid.14)), cex=2.5, vjust=0.2, label="Sublineage 14",hjust = -0.5)



p.Nichols.coll.2clades <- Nichols.coll.2clades %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage) +
  geom_tippoint(aes(color=Sublineage), size=2.5, alpha=0.25,show.legend=F) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  # add bootstrap support
  geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=2, shape=18) +
  geom_tiplab(size=2.5,align=F) + 
  geom_treescale(fontsize = 2.5, x=0.00001, y=25) + 
  geom_tiplab(size=2.5) + 
  geom_hilight(node=1055, alpha=0.25, fill=sublineages.cols.brew[sublineages.cols.brew$sublineage==6,"sublineage.cols"]) +
  geom_hilight(node=957, alpha=0.25, fill=sublineages.cols.brew[sublineages.cols.brew$sublineage==7,"sublineage.cols"])

p.Nichols.coll.2clades <- p.Nichols.coll.2clades + new_scale_fill()

p.Nichols.coll.2clades.hm <- gheatmap(p.Nichols.coll.2clades,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage), color=NULL,width=0.06,offset=0.0000095, colnames_angle=0,colnames_offset_y=-0.01, font.size=2.25) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right')

p.Nichols.coll.2clades.hm <- p.Nichols.coll.2clades.hm + new_scale_fill()

p.Nichols.coll.2clades.hm <- gheatmap(p.Nichols.coll.2clades.hm,TPA.rawseq.countries.p, color=NULL,width=0.06,offset=0.0000165, colnames_angle=0,colnames_offset_y=-0.01, font.size=2.25) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='bottom') + 
  #geom_treescale(fontsize = 2.5, x=0.000001, y=35) +
  NULL
p.Nichols.coll.2clades.hm <- p.Nichols.coll.2clades.hm + new_scale_fill()


p.Nichols.coll.2clades.hm <- gheatmap(p.Nichols.coll.2clades.hm, data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Year=TPA.meta1.2.pinecone$Sample_5year.window, stringsAsFactors = F), color=NULL,width=0.06,offset=0.0000235, colnames_angle=0,colnames_offset_y=-0.01, font.size=2.25) + scale_fill_manual(name="Year",values=TPA.5year.window.brewcols$window.5year.cols[1:(length(TPA.5year.window.brewcols$window.5year.cols)-1)], breaks=TPA.5year.window.brewcols$window.5year[1:(length(TPA.5year.window.brewcols$window.5year)-1)]) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='left') + 
  ylim(-1,57)

p.TPA.Nichols.coll.highlight <- p.TPA.Nichols.coll.highlight + new_scale_fill()



#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure5_MLtree_highlight-outgroups__02-2021.svg"), width = 700, height = 500,type="svg",units = "pt")
p.Nichols.coll.2clades.hm + ggtitle("Nichols-lineage phylogeny with collapsed nodes") + theme(plot.title = element_text(size = 10))

#dev.off()

Subset SS14 tree to show outgroups


#SS14.coll + geom_text2(aes(subset=!isTip, label=node), hjust=-.3, size=2.5)

#ss14.sublin2.collapse.node <- 542
ss14.sublin1.collapse.node <- 534 #535

SS14.subclades1 <- ggtree(TPA.MLtree) %>% collapse(node=Nichols.subtree.nodeid) %>% 
  collapse(node=ss14.sublin1.collapse.node)
# Add some extra to y axis for spacing
SS14.subclades1$data[SS14.subclades1$data$node==Nichols.subtree.nodeid,"y"] <- SS14.subclades1$data[SS14.subclades1$data$node==Nichols.subtree.nodeid,"y"] -5

SS14.subclades1$data[SS14.subclades1$data$node==ss14.sublin1.collapse.node,"y"] <- SS14.subclades1$data[SS14.subclades1$data$node==ss14.sublin1.collapse.node,"y"] + 2

# Add first triangle
SS14.subclades1 <- SS14.subclades1 + geom_text2(aes(subset=(node == Nichols.subtree.nodeid)), size=20, label=intToUtf8(9664), hjust=0.2,vjust=.55, family="OpenSansEmoji", color="royalblue2", alpha=.75)
SS14.subclades1 <- SS14.subclades1 + geom_text2(aes(subset=(node == Nichols.subtree.nodeid)), cex=2.5, vjust=0.2, label="Nichols",hjust = -1.25)
# Add second triangle
SS14.subclades1 <- SS14.subclades1 + geom_text2(aes(subset=(node == ss14.sublin1.collapse.node)), size=20, label=intToUtf8(9664), hjust=0.2,vjust=.55, family="OpenSansEmoji", color=sublineages.cols.brew[sublineages.cols.brew$sublineage==1,"sublineage.cols"], alpha=.75)
SS14.subclades1 <- SS14.subclades1 + geom_text2(aes(subset=(node == ss14.sublin1.collapse.node)), cex=2, vjust=0.2, label="Sublineage 1",hjust = -0.75)


# add tippoint colours (sublineage)
p.SS14.subclades1 <- SS14.subclades1 %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage) +
  geom_tippoint(aes(color=Sublineage), size=1.5, alpha=0.25,show.legend=F) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  # add bootstrap support
  geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=2, shape=18) +
  geom_treescale(fontsize = 2.5, x=0.00001, y=25)  
p.SS14.subclades1 <- p.SS14.subclades1 + new_scale_fill()

# add heatmap strips (sublineage)
p.SS14.subclades1.hm <- gheatmap(p.SS14.subclades1,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage), color=NULL,width=0.06,offset=0.0000095, colnames_angle=0,colnames_offset_y=-0.01, font.size=2.25) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right')
p.SS14.subclades1.hm <- p.SS14.subclades1.hm + new_scale_fill()

# add heatmap strips (country)
p.SS14.subclades1.hm <- gheatmap(p.SS14.subclades1.hm,TPA.rawseq.countries.p, color=NULL,width=0.06,offset=0.0000155, colnames_angle=0,colnames_offset_y=-0.01, font.size=2.25) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='bottom') +
  geom_tiplab(size=2) +
  NULL
p.SS14.subclades1.hm <- p.SS14.subclades1.hm + new_scale_fill()

# Add year group
p.SS14.subclades1.hm <- gheatmap(p.SS14.subclades1.hm,data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Year=TPA.meta1.2.pinecone$Sample_5year.window, stringsAsFactors = F), color=NULL,width=0.065,offset=0.0000215, colnames_angle=0,colnames_offset_y=-0.01, font.size=2.25) + scale_fill_manual(name="Year",values=TPA.5year.window.brewcols$window.5year.cols[1:(length(TPA.5year.window.brewcols$window.5year.cols)-1)], breaks=TPA.5year.window.brewcols$window.5year[1:(length(TPA.5year.window.brewcols$window.5year)-1)]) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='left') +
  ylim(-6,67) #ylim(-6,115)  
p.SS14.subclades1.hm <- p.SS14.subclades1.hm + new_scale_fill()


#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure4_MLtree_highlight-SS14__02-2021.svg"), width = 700, height = 800,type="svg",units = "pt")
p.SS14.subclades1.hm + ggtitle("SS14-lineage phylogeny") + theme(plot.title=element_text(size = 10))

#dev.off()




BEAST analysis



Subsampling representative tree for BEAST analyis


Country.list2 <- data.frame(continental.country.cols.brew2,stringsAsFactors = F)[continental.country.cols.brew2$Geo_Country!="Belgium","Geo_Country"]
sublineage.lineage.list2 <- sublineages.cols.brew[sublineages.cols.brew$sublineage!="Singleton","sublineage"]
#Full.samplelist2 <- TPA.meta1.2.pinecone$Sample_Name
Full.samplelist2 <- TPA.meta1.2.pinecone[((TPA.meta1.2.pinecone$Sample_Year!="-") & !grepl("Nichols",TPA.meta1.2.pinecone$Sample_Name) & !grepl("-",TPA.meta1.2.pinecone$Sample_Year)),"Sample_Name"]
TPA.meta1.2.pinecone.havedates <- TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Sample_Name %in% Full.samplelist2,]


mysamplesize <- 5
total.bootstraps <- 1

all.bootstraps.lists <- NULL
all.bootstraps.trees <- c(rtree(20)) # create with random start tree to force into a multiphylo object
current.bootstrap <- 0
repeat {
  country.sample <- NULL
  for (current.lineage in sublineage.lineage.list2){
    for (current.country in Country.list2){
      current.list <- TPA.meta1.2.pinecone.havedates[(TPA.meta1.2.pinecone.havedates$TPA.pinecone.sublineage==current.lineage & TPA.meta1.2.pinecone.havedates$Geo_Country==current.country),"Sample_Name"]
      current.list <- current.list[!is.na(current.list)]
      if (length(current.list)>1){
        current.sample <- unique(sample(current.list,size=mysamplesize, replace=T))
        country.sample <- c(as.vector(current.sample), country.sample)
      }
    }
  }
  country.sample <- c(country.sample, as.vector(TPA.meta1.2.pinecone.havedates[(TPA.meta1.2.pinecone.havedates$TPA.pinecone.sublineage=="Singleton"),"Sample_Name"]))
  
  current.bootstrap <- current.bootstrap + 1
  current.sample.tree <- (ape::keep.tip(TPA.MLtree, country.sample))
  all.bootstraps.trees <- c(all.bootstraps.trees,c(current.sample.tree),recursive=T)
  all.bootstraps.lists <- c(all.bootstraps.lists, list(country.sample))
  if (current.bootstrap == total.bootstraps){
    break
  }
}
all.bootstraps.trees <- all.bootstraps.trees[c(2:(total.bootstraps+1))] # remove random start tree

#all.bootstraps.trees[[1]]
#subsampled.ML.tips.6pCountry.pSublin <- all.bootstraps.trees[[1]]$tip.label

# Outputs from a previous random loop used below to ensure sample remains the same
subsampled.ML.tips.6pCountry.pSublin <- c("Mexico_A-mcf", "TPA_RUS_Tuva-62", "TPA_RUS_Tuva-58", "TPA_RUS_Tuva-59", "TPA_RUS_Tuva-61", "PHE140073A", "UW116B", "UW186B", "UW213B", "PHE150137A", "PHE150129A", "TPA_UKBRG017", "TPA_BCC161", "TPA_OMI006", "TPA_ALC105", "UW187B", "PHE150177A", "TPA_BCC085", "K3", "SHE_V", "C3", "Q3", "TPA_BCC075", "TPA_USL-BAL-2", "SS14_v2", "UW824B", "PHE140084A", "TPA_HUN200024", "TPA_HUN190022", "TPA_UKBRG004", "TPA_HUN190008", "UW099B", "TPA_USL-SEA-81-8", "TPA_ZIM025", "TPA_ZIM005", "TPA_ZIM009", "TPA_ZIM007", "UW262B", "UW391B", "TPA_BCC139", "TPA_BCC137", "TPA_UKBRG015", "TPA_UKBRG018", "PHE130041A", "TPA_BCC130", "UW376B", "PHE150159A", "UW244B", "UW291B", "TPA_BCC125", "PT_SIF1002", "PT_SIF1196", "PT_SIF0857", "PHE160254A", "PT_SIF1183", "PHE160249A", "PHE170379A", "PT_SIF1020", "PT_SIF1063", "PHE160315A", "PHE170398A", "PHE170380A", "PHE170365A", "UW148B", "UW473B", "UW492B", "UW248B", "TPA_HUN190023", "UW138B", "UW368B", "UW149B", "UW104B", "SMUTp_02", "SMUTp_01", "SMUTp_08", "PT_SIF0954", "PT_SIF1200", "TPA_BCC128", "TPA_BCC127", "TPA_AUSBR-45", "TPA_HUN180007", "PHE160246A", "UW327B", "CW87", "TPA_ESBCN002", "TPA_SWE-467", "TPA_EIR017", "TPA_HUN190017", "PHE160248A", "PHE130053A", "TPA_HUN180001", "TPA_SWE-662", "PHE130051A", "TPA_BCC138", "TPA_AUSBR-113", "TPA_SWE-1352", "PT_SIF0877_3", "PT_SIF1142", "TPA_EIR013", "AU15", "TPA_EIR008", "AU16", "TPA_BCC049", "CW84", "TPA_BCC052", "PHE170392A", "Seattle_81-4", "TPA_USL-SEA-83-2", "UW279B", "TPA_USL-SEA-86-1", "PHE170336B", "PHE150114A", "TPA_OMI021", "CW59", "CW82", "TPA_USL-Phil-3", "BAL3", "BAL73", "TPA_ZIM014", "TPA_ZIM018", "TPA_ZIM015", "PHE150166A", "PHE160306A", "PHE150168A", "TPA_BCC122", "PHE160294A", "TPA_HUN180004", "TPA_BCC136", "TPA_HUN190020", "PHE160287A", "TPA_BCC126", "TPA_BCC169", "TPA_BCC012", "PHE120029A", "PHE120033A", "TPA_USL-Haiti-B", "PHE160283A", "PHE130048A")



subsampled.ML.tips.6pCountry.pSublin.tree <- (ape::keep.tip(TPA.MLtree, subsampled.ML.tips.6pCountry.pSublin))

subsampled.ML.tips.6pCountry.pSublin.tree.data <- data.frame(fortify(subsampled.ML.tips.6pCountry.pSublin.tree),stringsAsFactors = F)
subsampled.ML.tips.6pCountry.pSublin.tree.data$Sample_Name <- subsampled.ML.tips.6pCountry.pSublin.tree.data$label 
subsampled.ML.tips.6pCountry.pSublin.tree.data <- plyr::join(subsampled.ML.tips.6pCountry.pSublin.tree.data, TPA.meta1.2.pinecone.havedates[,c("Sample_Name","TPA.pinecone.major","TPA.pinecone.sublineage", "Sample_Year")], by="Sample_Name", type="left")




plot.subsampled.ML.tips.6pCountry.pSublin.tree <- ggtree(subsampled.ML.tips.6pCountry.pSublin.tree) %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage) + 
  geom_tippoint(aes(color=Sublineage), size=1, alpha=0.5) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) 

plot.subsampled.ML.tips.6pCountry.pSublin.tree <- gheatmap(plot.subsampled.ML.tips.6pCountry.pSublin.tree,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage), color=NULL,width=0.04, colnames_angle=0,colnames_offset_y=-1, font.size=2.5) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.5,"line"))
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.

plot subsampled ML subtree

plot.subsampled.ML.tips.6pCountry.pSublin.tree

Define function to extract and plot root-2-tip data from a tree or subtree

# Inputs
# - A maximum likelihood tree in phylo
# - a dataframe with the headers c("Sample_Name","Sample_Year")

plotRootToTip <- function(input.ml.tree, input.dates.df){
  tree.data <- data.frame(ggtree::fortify(input.ml.tree),stringsAsFactors = F)
  tree.data$Sample_Name <- tree.data$label
  tree.data <- plyr::join(tree.data, input.dates.df, by="Sample_Name", type="left")
  RootTotipDistances <- tree.data[tree.data$isTip==TRUE,"x"]
  treeLabels <- tree.data[tree.data$isTip==TRUE,"Sample_Name"]
  ntips <- length(treeLabels)
  treeDates <- as.numeric(tree.data[tree.data$isTip==TRUE,"Sample_Year"])
  maxdate <- max(treeDates)
  mindate <- min(treeDates)
  treeModel <- lm(RootTotipDistances ~ treeDates)
  treeCorrelation <- cor.test(treeDates, RootTotipDistances, method = "pearson", conf.level = 0.95)
  modelSummary <- summary(treeModel)
  xIntercept <- -coef(treeModel)[1]/coef(treeModel)[2]
  RootTotipDF <- data.frame(RootTotipDistances,treeDates)
  
  plot.tree.data.root2tip <- ggplot(data = RootTotipDF, aes(treeDates,  RootTotipDistances)) +
    geom_point(alpha=0.25,size=2, colour = "red") +
    theme_classic() +
    labs(x="Year",y="Root to tip distance") +
    #geom_smooth(method='lm',fullrange=T,se=T) +
    stat_smooth(method='lm',fullrange=T,se=T) +  
    ggtitle(paste0("Slope: ",formatC(modelSummary$coefficients[2], format = "e", digits = 3),"; ",
                  "TMRCA: ",round(xIntercept,1),
                  "\n", "Correlation Coefficient: ",round(treeCorrelation$estimate,3),
                  "; ", "R^2: ", format(modelSummary$r.squared,digits=3),"\n",ntips," tips","; Timespan: ",mindate,"-",maxdate)) +
    theme(plot.title = element_text(size = 9))
  
  return(plot.tree.data.root2tip)
}

now plot

p.subsampled.ML.tips.6pCountry.pSublin.root2tip <- plotRootToTip(subsampled.ML.tips.6pCountry.pSublin.tree, TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])

p.subsampled.ML.tips.6pCountry.pSublin.root2tip.2versions <- plot_grid(p.subsampled.ML.tips.6pCountry.pSublin.root2tip, p.subsampled.ML.tips.6pCountry.pSublin.root2tip + scale_x_continuous(limits = c(1400,2020)) + coord_cartesian(xlim=c(1400,2020), ylim=c(0,8.5e-5)), ncol=2, labels=c('B','C'), label_size=11) 
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
p.subsampled.ML.tips.6pCountry.pSublin.root2tip.2versions 

Plot ML Subtree with root-2-tip graph


#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure15__ML-subtree1-with-root2tip_02-2021.svg"), width = 800, height = 800,type="svg",units = "pt")
plot_grid(plot.subsampled.ML.tips.6pCountry.pSublin.tree +ylim(-1,139), p.subsampled.ML.tips.6pCountry.pSublin.root2tip.2versions, ncol=1, scale=0.9, labels=c('A',''), label_size=11)

#dev.off()

Took sequence alignment from this tree, and analysed using BEAST 1.8.4

BEAST analysis (HYK subst model), comparing

Strict - Constant pop Strict - Skyline pop (10 cats) RelLogNormal - Constant pop RelLogNormal - Exponential pop RelLogNormal - Skyline pop (10 cats)

  • Can’t reject strict clock (ucldev.sd overlaps zero subtantially)
  • Stepping stone analysis shows Strick-Skyline is best model (although Strict Constant is nearly as close)

Bring in Strict Skyline BEAST tree and plot

# Bring in beast tree and extract tree data into dataframe
TPA.beast.tree <- read.beast(TPA.beast.subtree.file)
TPA.beast.tree.data <- data.frame(fortify(TPA.beast.tree),stringsAsFactors = F)


# BEAST tipnames have date included - lets remove that for plotting with metadata
TPA.beast.tipnames <- data.frame(beast.name=TPA.beast.tree@phylo$tip.label,stringsAsFactors = F)
TPA.beast.tipnames$meta.name <- gsub("\\|.+$","", TPA.beast.tipnames$beast.name)
TPA.beast.tree@phylo$tip.label <- TPA.beast.tipnames$meta.name
#TPA.beast.tree@phylo$tip.label


# Build plot
TPA.beast.plot1 <- ggtree(TPA.beast.tree,mrsd="2019-06-01",ladderize = T) + 
  theme_tree2() +
  # Add date lines for easy interpretation  
  scale_x_continuous(breaks=c(1300,1400,1500,1600,1700,1800,1850,1900,1925,1950,1975,2000,2020), minor_breaks=seq(1950, 2020, 5)) +
  theme(panel.grid.major   = element_line(color="grey50", size=.2),
        panel.grid.minor   = element_line(color="grey85", size=.2),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) 
# Add posterior support as node points
TPA.beast.plot1 <- TPA.beast.plot1 + geom_point2(aes(subset=(!isTip & as.numeric(posterior)>0.8)),color="gray60",size=2.5,alpha=0.5, shape=18) + 
  geom_point2(aes(subset=(!isTip & as.numeric(posterior)>0.91)),color="gray40",size=2.5,shape=18,alpha=0.5) + 
  geom_point2(aes(subset=(!isTip & as.numeric(posterior)>=0.96)),color="black",size=2.5,shape=18,alpha=0.5)


# Plot 95% HPD intervals - geom_range seems unable to do correctly with this tree (known bug for tip dated trees), so extract data and plot using geom_segment
minmax <- t(matrix(unlist(TPA.beast.tree.data[!is.na(TPA.beast.tree.data$height_0.95_HPD),"height_0.95_HPD"]),nrow=2))
bar_df <- data.frame(node_id=TPA.beast.tree.data[!is.na(TPA.beast.tree.data$height_0.95_HPD),"node"],as.data.frame(minmax))
names(bar_df) <- c('node_id','min','max') 
bar_df <- bar_df %>% filter(node_id > Ntip(TPA.beast.tree@phylo))
bar_df <- bar_df %>% left_join(TPA.beast.plot1$data, by=c('node_id'='node')) %>% select(node_id,min,max,y)
mrcd.decimal <- decimal_date(as.Date("2019-06-01","%Y-%m-%d"))
TPA.beast.plot1 <- TPA.beast.plot1 + geom_segment(aes(x=mrcd.decimal-max, y=y, xend=mrcd.decimal-min, yend=y), data=bar_df, color='red', alpha=0.2, size=2.25)

TPA.beast.plot1 <- TPA.beast.plot1 + new_scale_fill()

TPA.beast.plot1 + geom_tiplab(size=2, align=T)

with metadata


TPA.beast.plot1.meta <- gheatmap(TPA.beast.plot1,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage), color=NULL,width=0.05, offset=2,colnames_angle=0,colnames_offset_y=-1, font.size=2.5) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size
TPA.beast.plot1.meta <- TPA.beast.plot1.meta + new_scale_fill()


TPA.beast.plot1.meta <- gheatmap(TPA.beast.plot1.meta,TPA.rawseq.countries.p, color=NULL,width=0.05,offset=36, colnames_angle=0,colnames_offset_y=-1, font.size=2.5) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size
TPA.beast.plot1.meta <- TPA.beast.plot1.meta + new_scale_fill()
beast.country.legend <- get_legend(TPA.beast.plot1.meta + theme(legend.key.size = unit(0.65,"line"),legend.position='right'))

TPA.beast.plot1.meta <- gheatmap(TPA.beast.plot1.meta,data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Year=TPA.meta1.2.pinecone$Sample_5year.window, stringsAsFactors = F), color=NULL,width=0.05, offset=70,colnames_angle=0,colnames_offset_y=-1, font.size=2.5) + scale_fill_manual(name="Year",values=TPA.5year.window.brewcols$window.5year.cols[1:(length(TPA.5year.window.brewcols$window.5year.cols)-1)], breaks=TPA.5year.window.brewcols$window.5year[1:(length(TPA.5year.window.brewcols$window.5year)-1)]) +
  theme.text.size + 
  #theme(legend.key.size = unit(0.65,"line"),legend.position='left') #+
  #guides(fill=guide_legend(ncol=3)) +
  NULL
TPA.beast.plot1.meta <- TPA.beast.plot1.meta + new_scale_fill()
beast.year.legend <- get_legend(TPA.beast.plot1.meta + theme(legend.key.size = unit(0.65,"line"),legend.position='right'))


TPA.beast.plot1.meta <- TPA.beast.plot1.meta + geom_vline(xintercept = 2000, color='blue', alpha=0.5)
TPA.beast.plot1.meta <- TPA.beast.plot1.meta + annotate("rect",xmin=2000,xmax=2020,ymin=-1,ymax=138,alpha=0.1, fill='blue')

Look at and plot skyline data

beast.subtree.skyline <- read.table(beast.subtree.skyline.file, sep="\t", check.names=F, comment.char="", header=T, stringsAsFactors=F)

p.beast.subtree.skyline <- ggplot(beast.subtree.skyline, aes(Time,Median)) + 
  geom_line() +
  geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
  theme_light() + 
  scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) +
  scale_y_log10() + 
  coord_cartesian(x=c(1750,2020), y=c(50,3000)) + 
  theme.text.size +
  #labs(y="Median effective population size", x="Year") + 
  labs(y="Relative genetic diversity", x="Year") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) +
  annotate("rect",xmin=2000,xmax=2020,ymin=0,ymax=10000,alpha=0.1, fill='blue')
#p.beast.subtree.skyline

Look at lineage preditions

beast.subtree.skyline.lineages <- read.table(beast.subtree.skyline.lineage.file, sep="\t", check.names=F, comment.char="", header=T, stringsAsFactors=F)

p.beast.subtree.skyline.lineages <- ggplot(beast.subtree.skyline.lineages, aes(Time,Median)) + 
  geom_line() +
  geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
  theme_light() + 
  scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) +
  scale_y_log10() + 
  coord_cartesian(x=c(1750,2020), y=c(1,300)) + 
  theme.text.size +
  labs(y="Lineages", x="Year") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) +
  annotate("rect",xmin=2000,xmax=2020,ymin=0,ymax=500,alpha=0.1, fill='blue')
#p.beast.subtree.skyline.lineages

Make combined plot

skyline.row <- plot_grid(NULL,p.beast.subtree.skyline,NULL, ncol=3, rel_widths = c(1,3,1))
lineage.row <- plot_grid(NULL,p.beast.subtree.skyline.lineages,NULL, ncol=3, rel_widths = c(1,3,1))

#both.skyline.lineage.rows <- plot_grid(skyline.row, lineage.row, align=T, ncol=1, labels=c('B','C'))
both.skyline.lineage.rows <- plot_grid(skyline.row, align=T, ncol=1, labels=c('B'),label_size=11)


#plot.beast.with.skyline <- plot_grid(TPA.beast.plot1.meta + theme(legend.position='none') + ylim(-1,138), both.skyline.lineage.rows, ncol=1, align=T, rel_heights = c(2,2),labels=c('A',''), label_size=11)

plot.beast.with.skyline <- plot_grid(TPA.beast.plot1.meta + theme(legend.position='none') + ylim(-1,138), both.skyline.lineage.rows, ncol=1, align=T, rel_heights = c(3,1),labels=c('A',''), label_size=11)

beast.legend.combined <- plot_grid(beast.year.legend, NULL, ncol=1, rel_heights=c(3,1))



#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure16_subsample138_BEAST-StrictCSkyline_02-2021.svg"), width = 800, height = 800,type="svg",units = "pt")
plot_grid(beast.legend.combined, plot.beast.with.skyline, ncol=2, rel_widths=c(1,10), labels=c('Key',''), label_size=11)

#dev.off()

Pull out MRCA nodes and date ranges from beast subtree and sublineages

# Subset metadata
TPA.meta1.2.beast.subset1 <- TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Sample_Name %in% as.phylo(TPA.beast.tree)$tip.label,]
# Define key sublineages
#Expanded.sublineages <- data.frame(Sublineage=c(2,3,5,6,12,19), stringsAsFactors = F)
Expanded.sublineages <- data.frame(Sublineage=c(1,2,8,14), stringsAsFactors = F)


# Run loop to extract MRCA node for each sublineage
Expanded.sublineage.nodes <- NULL
for (current.sublineage.exp1 in Expanded.sublineages$Sublineage) {
  Expanded.sublineage.nodes <- c(Expanded.sublineage.nodes, ape::getMRCA(as.phylo(TPA.beast.tree),as.character(TPA.meta1.2.beast.subset1[TPA.meta1.2.beast.subset1$TPA.pinecone.sublineage==current.sublineage.exp1,"Sample_Name"])))
}
Expanded.sublineages$node <- Expanded.sublineage.nodes
#Expanded.sublineages

# Split sublineage 2 in this tree creates a problem. Change it to 180
#Expanded.sublineages[Expanded.sublineages$sublineage==2,]
Expanded.sublineages[Expanded.sublineages$Sublineage==2,"node"] <- 180


TPA.beast.plot1 + geom_text2(aes(subset=!isTip, label=node), hjust=-.3, size=2) +
  geom_point2(aes(subset=(node %in% Expanded.sublineages$node)),color="red") +
  ggtitle("Red nodes indicate MRCA for each sublineage")

#Extract relevant nodes data from beast tree data
Expanded.sublineage.nodes.beast <- plyr::join(Expanded.sublineages,TPA.beast.tree.data[,c("node","height","height_0.95_HPD","height_median","height_range")], by="node")

# Data is in the form of "height" information - need to convert to years relative to mrcd (2019/06/01)
Expanded.sublineage.nodes.beast$mrca.median <- 2019.5 - Expanded.sublineage.nodes.beast$height_median
Expanded.sublineage.nodes.beast$year <- as.numeric(round(2019.5 - Expanded.sublineage.nodes.beast$height_median,0))


Expanded.sublineage.nodes.beast$mrca.95high <- round(2019.5 - sapply(1:nrow(Expanded.sublineage.nodes.beast),function(x) as.numeric(unlist(Expanded.sublineage.nodes.beast[x,"height_0.95_HPD"]))[1]))

Expanded.sublineage.nodes.beast$mrca.95low <- round(2019.5 - sapply(1:nrow(Expanded.sublineage.nodes.beast),function(x) as.numeric(unlist(Expanded.sublineage.nodes.beast[x,"height_0.95_HPD"]))[2]))


Expanded.sublineage.nodes.beast <- Expanded.sublineage.nodes.beast[order(Expanded.sublineage.nodes.beast$Sublineage),]
Expanded.sublineage.nodes.beast$Sublineage <- factor(Expanded.sublineage.nodes.beast$Sublineage, levels=rev(sublineages.cols.brew$sublineage))
Expanded.sublineage.nodes.beast$sub.order <- rev(c(1:nrow(Expanded.sublineage.nodes.beast)))

Repeat BEAST subsampling, but ensure all year are represented (and make a slightly bigger tree)


year.list <- sort(unique(TPA.meta1.2.pinecone.havedates$Sample_Year))


mysamplesize.2 <- 3
total.bootstraps.2 <- 1

all.bootstraps.lists.2 <- NULL
all.bootstraps.trees.2 <- c(rtree(20)) # create with random start tree to force into a multiphylo object
current.bootstrap.2 <- 0
repeat {
  country.sample.2 <- NULL
  for (current.year.2 in year.list) { 
    for (current.lineage.2 in sublineage.lineage.list2){
      for (current.country.2 in Country.list2){
        current.list.2 <- TPA.meta1.2.pinecone.havedates[(TPA.meta1.2.pinecone.havedates$TPA.pinecone.sublineage==current.lineage.2 & TPA.meta1.2.pinecone.havedates$Geo_Country==current.country.2 & TPA.meta1.2.pinecone.havedates$Sample_Year==current.year.2),"Sample_Name"]
        current.list.2 <- current.list.2[!is.na(current.list.2)]
        if (length(current.list.2)>1){
          current.sample.2 <- unique(sample(current.list.2,size=mysamplesize.2, replace=T))
          country.sample.2 <- c(as.vector(current.sample.2), country.sample.2)
        }
      }
    }
  }
  country.sample.2 <- c(country.sample.2, as.vector(TPA.meta1.2.pinecone.havedates[(TPA.meta1.2.pinecone.havedates$TPA.pinecone.sublineage=="Singleton"),"Sample_Name"]))
  
  current.bootstrap.2 <- current.bootstrap.2 + 1
  current.sample.tree.2 <- (ape::keep.tip(TPA.MLtree, country.sample.2))
  all.bootstraps.trees.2 <- c(all.bootstraps.trees.2,c(current.sample.tree.2),recursive=T)
  all.bootstraps.lists.2 <- c(all.bootstraps.lists.2, list(country.sample.2))
  if (current.bootstrap.2 == total.bootstraps.2){
    break
  }
}
all.bootstraps.trees.2 <- all.bootstraps.trees.2[c(2:(total.bootstraps.2+1))] # remove random start tree

#all.bootstraps.trees[[1]]
#subsampled.ML.tips.3pv.country.dates.sublin <- all.bootstraps.trees.2[[1]]$tip.label

# Outputs from a previous random loop used below to ensure sample remains the same
subsampled.ML.tips.3pv.country.dates.sublin <- c("Mexico_A-mcf", "TPA_RUS_Tuva-62", "TPA_RUS_Tuva-39", "TPA_RUS_Tuva-61", "PHE140073A", "UW337B", "UW186B", "UW330B", "UW213B", "UW215B", "PHE150129A", "TPA_UKBRG017", "TPA_BCC161", "TPA_OMI006", "UW231B", "UW187B", "K3", "SHE_V", "SHG_I2", "C3", "TPA_BCC165", "TPA_USL-BAL-2", "SS14_v2", "TPA_HUN200024", "TPA_HUN190022", "TPA_USL-SEA-81-8", "TPA_ZIM025", "TPA_ZIM019", "TPA_ZIM007", "UW262B", "UW391B", "PHE150151A", "TPA_UKBIR026", "TPA_OMI002", "TPA_BCC139", "TPA_BCC137", "TPA_UKBRG015", "PHE130041A", "PHE130050A", "UW376B", "PHE150159A", "UW280B", "UW244B", "TPA_BCC125", "PHE160254A", "PT_SIF1183", "PHE160249A", "PHE170379A", "PHE160315A", "PHE160260A", "PHE150131A", "PHE170402A", "PHE170380A", "UW473B", "UW492B", "TPA_USL-Phil-1", "PHE120030A", "PHE120014A", "PHE130043A", "PT_SIF1167", "PHE140093A", "TPA_HUN190023", "TPA_BCC058", "TPA_USL-SEA-87-1", "TPA_UKBIR050", "UW155B", "UW211B", "TPA_BCC038", "TPA_BCC034", "TPA_BCC008", "TPA_BCC009", "UW257B", "UW138B", "UW102B", "UW344B", "UW126B", "SMUTp_01", "SMUTp_08", "UW383B", "TPA_ALC115", "UW823B", "PHE170385A", "UW304B", "PT_SIF0954", "PT_SIF1200", "TPA_BCC128", "PHE130056A", "TPA_ALC036", "TPA_BCC032", "TPA_BCC132", "TPA_AUSBR-45", "TPA_BCC153", "TPA_AUSBR-39", "TPA_HUN180007", "UW411B", "TPA_ESBCN002", "PHE140074A", "TPA_SWE-467", "PHE150126A", "PHE170412A", "PHE150173A", "TPA_UKBRG009", "TPA_ESBCN004", "TPA_BCC123", "TPA_BCC147", "TPA_BCC063", "PHE120024A", "TPA_BCC174", "TPA_SWE-575", "TPA_HUN190017", "PHE160248A", "TPA_BCC176", "TPA_HUN180001", "PHE130054A", "UW852B", "PHE140081A", "UW259B", "TPA_AUSBR-113", "PHE160301A", "TPA_BCC102", "PT_SIF1278", "PT_SIF0877_3", "PHE160265A", "AU15", "TPA_EIR008", "TPA_ALC055", "TPA_BCC175", "PT_SIF1280", "TPA_BCC185", "TPA_BCC111", "TPA_BCC157", "TPA_BCC030", "PHE170409A", "TPA_BCC049", "TPA_BCC061", "TPA_BCC052", "PHE150161A", "TPA_ALC126", "TPA_OMI022", "UW189B", "UW279B", "TPA_OMI021", "PHE170401A", "CW59", "CW82", "BAL73", "TPA_ZIM015", "TPA_ZIM020", "PHE150118A", "PHE170333A", "PHE160302A", "PHE140089A", "PHE170381A", "PHE160263A", "TPA_BCC089", "PHE160316A", "TPA_BCC122", "PHE170386A", "PHE140076A", "PHE150149A", "TPA_BCC136", "PHE150170A", "TPA_UKBRG007", "TPA_BCC081", "TPA_BCC012", "PHE120029A", "PHE120033A", "TPA_USL-Haiti-B")



subsampled.ML.tips.3pv.country.dates.sublin.tree <- (ape::keep.tip(TPA.MLtree, subsampled.ML.tips.3pv.country.dates.sublin))

subsampled.ML.tips.3pv.country.dates.sublin.tree.data <- data.frame(fortify(subsampled.ML.tips.3pv.country.dates.sublin.tree),stringsAsFactors = F)
subsampled.ML.tips.3pv.country.dates.sublin.tree.data$Sample_Name <- subsampled.ML.tips.3pv.country.dates.sublin.tree.data$label 
subsampled.ML.tips.3pv.country.dates.sublin.tree.data <- plyr::join(subsampled.ML.tips.3pv.country.dates.sublin.tree.data, TPA.meta1.2.pinecone.havedates[,c("Sample_Name","TPA.pinecone.major","TPA.pinecone.sublineage", "Sample_Year")], by="Sample_Name", type="left")

subsampled.metalist2 <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Sample_Name %in% subsampled.ML.tips.3pv.country.dates.sublin),c("Sample_Name","Cleaned_fastq_id","Sample_Year")] 


plot.subsampled.ML.tips.3pv.country.dates.sublin.tree <- ggtree(subsampled.ML.tips.3pv.country.dates.sublin.tree) %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage) + 
  geom_tippoint(aes(color=Sublineage), size=1, alpha=0.5) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) 

plot.subsampled.ML.tips.3pv.country.dates.sublin.tree <- gheatmap(plot.subsampled.ML.tips.3pv.country.dates.sublin.tree,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage), color=NULL,width=0.04, colnames_angle=0,colnames_offset_y=-1, font.size=2.5) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.5,"line"))
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
plot.subsampled.ML.tips.3pv.country.dates.sublin.tree

do root2tip for second subsampled tree

p.subsampled.ML.tips.3pv.country.dates.sublin.root2tip <- plotRootToTip(subsampled.ML.tips.3pv.country.dates.sublin.tree, TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])


plot_grid(p.subsampled.ML.tips.3pv.country.dates.sublin.root2tip, p.subsampled.ML.tips.3pv.country.dates.sublin.root2tip + scale_x_continuous(limits = c(1400,2020)) + coord_cartesian(xlim=c(1400,2020), ylim=c(0,8.5e-5)), ncol=2)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'

Plot repeat subsetted tree and skyline info

repeat.subsampled.skyline.tree <- read.beast(repeat.subsampled.skyline.tree.file)
repeat.subsampled.skyline.tree.data <- data.frame(fortify(repeat.subsampled.skyline.tree),stringsAsFactors = F)

# BEAST tipnames have date included - lets remove that for plotting with metadata
repeat.subsampled.skyline.tree.tipnames <- data.frame(beast.name=repeat.subsampled.skyline.tree@phylo$tip.label,stringsAsFactors = F)
repeat.subsampled.skyline.tree.tipnames$meta.name <- gsub("\\|.+$","", repeat.subsampled.skyline.tree.tipnames$beast.name)
repeat.subsampled.skyline.tree@phylo$tip.label <- repeat.subsampled.skyline.tree.tipnames$meta.name

# Build plot
repeat.subsampled.skyline.plot1 <- ggtree(repeat.subsampled.skyline.tree,mrsd="2019-06-01",ladderize = T) + 
  theme_tree2() +
  # Add date lines for easy interpretation  
  scale_x_continuous(breaks=c(1300,1400,1500,1600,1700,1800,1850,1900,1925,1950,1975,2000,2020), minor_breaks=seq(1950, 2020, 5)) +
  theme(panel.grid.major   = element_line(color="grey50", size=.2),
        panel.grid.minor   = element_line(color="grey85", size=.2),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) 
# Add posterior support as node points
repeat.subsampled.skyline.plot1 <- repeat.subsampled.skyline.plot1 + geom_point2(aes(subset=(!isTip & as.numeric(posterior)>0.8)),color="gray60",size=2.5,alpha=0.5, shape=18) + 
  geom_point2(aes(subset=(!isTip & as.numeric(posterior)>0.91)),color="gray40",size=2.5,shape=18,alpha=0.5) + 
  geom_point2(aes(subset=(!isTip & as.numeric(posterior)>=0.96)),color="black",size=2.5,shape=18,alpha=0.5)

# Plot 95% HPD intervals - geom_range seems unable to do correctly with this tree (known bug for tip dated trees), so extract data and plot using geom_segment
rep.minmax <- t(matrix(unlist(repeat.subsampled.skyline.tree.data[!is.na(repeat.subsampled.skyline.tree.data$height_0.95_HPD),"height_0.95_HPD"]),nrow=2))
rep.bar_df <- data.frame(node_id=repeat.subsampled.skyline.tree.data[!is.na(repeat.subsampled.skyline.tree.data$height_0.95_HPD),"node"],as.data.frame(rep.minmax))
names(rep.bar_df) <- c('node_id','min','max') 
rep.bar_df <- rep.bar_df %>% filter(node_id > Ntip(repeat.subsampled.skyline.tree@phylo))
rep.bar_df <- rep.bar_df %>% left_join(repeat.subsampled.skyline.plot1$data, by=c('node_id'='node')) %>% select(node_id,min,max,y)
rep.mrcd.decimal <- decimal_date(as.Date("2019-06-01","%Y-%m-%d"))
repeat.subsampled.skyline.plot1 <- repeat.subsampled.skyline.plot1 + geom_segment(aes(x=rep.mrcd.decimal-max, y=y, xend=rep.mrcd.decimal-min, yend=y), data=rep.bar_df, color='red', alpha=0.2, size=2.25)
repeat.subsampled.skyline.plot1 <- repeat.subsampled.skyline.plot1 + new_scale_fill()

# Add markers after 2000
repeat.subsampled.skyline.plot1 <- repeat.subsampled.skyline.plot1 + geom_vline(xintercept = 2000, color='blue', alpha=0.5) + 
  annotate("rect",xmin=2000,xmax=2020,ymin=-1,ymax=170,alpha=0.1, fill='blue')

repeat.subsampled.skyline.plot1 + geom_tiplab(size=1.5, align=T)

NA
NA
NA
# Skyline
rep.beast.subtree.skyline <- read.table(repeat.subsampled.skyline.data.file, sep="\t", check.names=F, comment.char="", header=T, stringsAsFactors=F)
p.rep.beast.subtree.skyline <- ggplot(rep.beast.subtree.skyline, aes(Time,Median)) + 
  geom_line() + geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
  theme_light() + 
  scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + scale_y_log10() + 
  coord_cartesian(x=c(1750,2020), y=c(50,3000)) + 
  theme.text.size + labs(y="Relative genetic diversity", x="Year") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) + annotate("rect",xmin=2000,xmax=2020,ymin=0,ymax=10000,alpha=0.1, fill='blue')

# Lineages
rep.beast.subtree.skyline.lineages <- read.table(repeat.subsampled.skyline.lineages.data.file, sep="\t", check.names=F, comment.char="", header=T, stringsAsFactors=F)
p.rep.beast.subtree.skyline.lineages <- ggplot(rep.beast.subtree.skyline.lineages, aes(Time,Median)) + 
  geom_line() + geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
  theme_light() + 
  scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + scale_y_log10() + 
  coord_cartesian(x=c(1750,2020), y=c(1,300)) + 
  theme.text.size + labs(y="Lineages", x="Year") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) + annotate("rect",xmin=2000,xmax=2020,ymin=0,ymax=500,alpha=0.1, fill='blue')
#p.rep.beast.subtree.skyline.lineages



# Make combined plot
rep.skyline.row <- plot_grid(NULL,p.rep.beast.subtree.skyline,NULL, ncol=3, rel_widths = c(1,3,1))
rep.lineage.row <- plot_grid(NULL,p.rep.beast.subtree.skyline.lineages,NULL, ncol=3, rel_widths = c(1,3,1))
#rep.both.skyline.lineage.rows <- plot_grid(rep.skyline.row, rep.lineage.row, align=T, ncol=1, labels=c('B','C'))
rep.both.skyline.lineage.rows <- plot_grid(rep.skyline.row, align=T, ncol=1, labels=c('B'),label_size=11)

#plot.beast.with.skyline.repeat <- plot_grid(repeat.subsampled.skyline.plot1 + theme(legend.position='none'), rep.both.skyline.lineage.rows, ncol=1, align=T, rel_heights = c(2,2),labels=c('A',''), label_size=11)

plot.beast.with.skyline.repeat <- plot_grid(repeat.subsampled.skyline.plot1 + theme(legend.position='none'), rep.both.skyline.lineage.rows, ncol=1, align=T, rel_heights = c(3,1),labels=c('A',''), label_size=11)



#plot.beast.with.skyline <- plot_grid(TPA.beast.plot1.meta + theme(legend.position='none'), skyline.row, ncol=1, align=T, rel_heights = c(2,1),labels=c('A','B'), label_size=11)
#beast.legend.combined <- plot_grid(beast.year.legend, NULL, ncol=1, rel_heights=c(3,1))


#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure17_subsample2-168_BEAST_StrictCSkyline_02-2021.svg"), width = 800, height = 800,type="svg",units = "pt")
plot.beast.with.skyline.repeat 

#dev.off()

Plotting Root-to-tip

# Define function
plot.subtree.R2T <- function(maintree, tips, dates.df){
  subtree.temp  <- ape::keep.tip(maintree, as.character(tips))
  plotRootToTip(subtree.temp, dates.df[,c("Sample_Name","Sample_Year")])
  plot_grid(ggtree(subtree.temp),plotRootToTip(subtree.temp, dates.df[,c("Sample_Name","Sample_Year")]),ncol=2)
}

Full tree dataset for BEAST1/2:

full.beast2.tree <- read.beast(full.beast2.tree.file)
tree.subsampled.ML.tips.full.data.for.beast2 <- (ape::keep.tip(TPA.MLtree, gsub("\\|.+$","",full.beast2.tree@phylo$tip.label, perl =T)))

tree.subsampled.ML.tips.full.data.for.beast2.r2t <- plot.subtree.R2T(tree.subsampled.ML.tips.full.data.for.beast2, gsub("\\|.+$","",full.beast2.tree@phylo$tip.label), TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])
`geom_smooth()` using formula 'y ~ x'
#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure18__ML-tree.fulltree520-with-root2tip_02-2021.svg"), width = 800, height = 600,type="svg",units = "pt")
tree.subsampled.ML.tips.full.data.for.beast2.r2t

#dev.off()


Pull in data for full BEAST2 tree

Full Beast tree

full.beast2.tree <- read.beast(full.beast2.tree.file)
full.beast2.tree.data <- data.frame(fortify(full.beast2.tree),stringsAsFactors = F)


# Lets sort out the date of various events - relate everything in years
full.beast2.tree.data$mrca.median <- 2019.5 - full.beast2.tree.data$height_median
full.beast2.tree.data$year <- as.numeric(round(2019.5 - full.beast2.tree.data$height_median,0))

full.beast2.tree.data$mrca.95high <- round(2019.5 - sapply(1:nrow(full.beast2.tree.data),function(x) as.numeric(unlist(full.beast2.tree.data[x,"height_0.95_HPD"]))[1]))

full.beast2.tree.data$mrca.95low <- round(2019.5 - sapply(1:nrow(full.beast2.tree.data),function(x) as.numeric(unlist(full.beast2.tree.data[x,"height_0.95_HPD"]))[2]))


# Join metadata
full.beast2.tree.data$Sample_Name <- gsub("\\|.+$","", full.beast2.tree.data$label)
full.beast2.tree.data.meta <- plyr::join(full.beast2.tree.data, TPA.meta1.2.pinecone[,c("Sample_Name", "Geo_Country","TPA.pinecone.sublineage","Sample_Year")], by="Sample_Name", type="left")

full.beast2.tree.data.meta <- full.beast2.tree.data.meta[full.beast2.tree.data.meta$isTip==T,]

Extract some key dates


# Plot tree with node labels (to visualise and identify key nodes)
ggtree(full.beast2.tree) + geom_text2(aes(subset=!isTip, label=node), hjust=-.3, size=2.5, color="blue") 

# Split sublineage 1 in this tree creates a problem. Change it to 528 (MRCA of main clade)
Expanded.sublineages[Expanded.sublineages$Sublineage==1,"node"] <- 528



# Extract dates for the MRCAs - when did those sublineages first arise? Pretty old!
plyr::join(Expanded.sublineages, full.beast2.tree.data[,c("node","mrca.median","year","mrca.95high","mrca.95low")], by="node")



ggtree(full.beast2.tree,mrsd="2019-06-01",ladderize = T) + geom_text2(aes(subset=!isTip, label=node), hjust=-.3, size=2) +
  geom_point2(aes(subset=(node %in% Expanded.sublineages$node)),color="blue") +
  ggtitle("Red nodes indicate MRCA for each sublineage")

Lets plot the BEAST tree

# BEAST tipnames have date included - lets remove that for plotting with metadata
full.beast2.tipnames <- data.frame(beast.name=full.beast2.tree@phylo$tip.label,stringsAsFactors = F)
full.beast2.tipnames$meta.name <- gsub("\\|.+$","", full.beast2.tipnames$beast.name)
full.beast2.tree@phylo$tip.label <- full.beast2.tipnames$meta.name
#TPA.beast.tree@phylo$tip.label


# Build plot
full.beast2.tree.plot1 <- ggtree(full.beast2.tree,mrsd="2019-06-01",ladderize = T) + 
  theme_tree2() +
  # Add date lines for easy interpretation  
  scale_x_continuous(breaks=c(1300,1400,1500,1600,1700,1800,1850,1900,1925,1950,1975,2000,2020), minor_breaks=seq(1950, 2020, 5)) +
  theme(panel.grid.major   = element_line(color="grey50", size=.2),
        panel.grid.minor   = element_line(color="grey85", size=.2),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) 
# Add posterior support as node points
full.beast2.tree.plot1 <- full.beast2.tree.plot1 + geom_point2(aes(subset=(!isTip & as.numeric(posterior)>0.8)),color="gray60",size=2.5,alpha=0.5, shape=18) + 
  geom_point2(aes(subset=(!isTip & as.numeric(posterior)>0.91)),color="gray40",size=2.75,shape=18,alpha=0.5) + 
  geom_point2(aes(subset=(!isTip & as.numeric(posterior)>=0.96)),color="black",size=2.75,shape=18,alpha=0.5) +
    geom_rootedge(rootedge=50)


# Plot 95% HPD intervals - geom_range seems unable to do correctly with this tree (known bug for tip dated trees), so extract data and plot using geom_segment
minmax.fulltree <- t(matrix(unlist(full.beast2.tree.data[!is.na(full.beast2.tree.data$height_0.95_HPD),"height_0.95_HPD"]),nrow=2))
bar_df.fulltree <- data.frame(node_id=full.beast2.tree.data[!is.na(full.beast2.tree.data$height_0.95_HPD),"node"],as.data.frame(minmax.fulltree))
names(bar_df.fulltree) <- c('node_id','min','max') 
bar_df.fulltree <- bar_df.fulltree %>% filter(node_id > Ntip(full.beast2.tree@phylo))
bar_df.fulltree <- bar_df.fulltree %>% left_join(full.beast2.tree.plot1$data, by=c('node_id'='node')) %>% select(node_id,min,max,y)
mrcd.decimal.fulltree <- decimal_date(as.Date("2019-06-01","%Y-%m-%d"))
full.beast2.tree.plot1 <- full.beast2.tree.plot1 + geom_segment(aes(x=mrcd.decimal.fulltree-max, y=y, xend=mrcd.decimal.fulltree-min, yend=y), data=bar_df.fulltree, color='red', alpha=0.2, size=1.5)

full.beast2.tree.plot1 <- full.beast2.tree.plot1 + new_scale_fill()
#full.beast2.tree.plot1 #+ geom_tiplab(size=2, align=T)

# Add Metadata
full.beast2.tree.plot1.meta <- gheatmap(full.beast2.tree.plot1,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage), color=NULL,width=0.05, offset=2,colnames_angle=0,colnames_offset_y=-1.5, font.size=2.25,) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size
full.beast2.tree.plot1.meta <- full.beast2.tree.plot1.meta + new_scale_fill()

full.beast2.tree.plot1.meta <- gheatmap(full.beast2.tree.plot1.meta,TPA.rawseq.countries.p, color=NULL,width=0.05,offset=36, colnames_angle=0,colnames_offset_y=-1.5, font.size=2.25) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size
full.beast2.tree.plot1.meta <- full.beast2.tree.plot1.meta + new_scale_fill()
#beast.country.legend <- get_legend(TPA.beast.plot1.meta + theme(legend.key.size = unit(0.65,"line"),legend.position='right'))

full.beast2.tree.plot1.meta <- gheatmap(full.beast2.tree.plot1.meta,data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Year=TPA.meta1.2.pinecone$Sample_5year.window, stringsAsFactors = F), color=NULL,width=0.05, offset=70,colnames_angle=0,colnames_offset_y=-1.5, font.size=2.25) + scale_fill_manual(name="Year",values=TPA.5year.window.brewcols$window.5year.cols[1:(length(TPA.5year.window.brewcols$window.5year.cols)-1)], breaks=TPA.5year.window.brewcols$window.5year[1:(length(TPA.5year.window.brewcols$window.5year)-1)]) +
  theme.text.size + theme(legend.key.size = unit(0.55,"line"), legend.position="left") +
  coord_cartesian(y=c(-5,522)) +
  NULL
full.beast2.tree.plot1.meta <- full.beast2.tree.plot1.meta + new_scale_fill()
#full.beast2.tree.plot1.meta.legend <- get_legend(full.beast2.tree.plot1.meta + theme(legend.key.size = unit(0.65,"line"),legend.position='right'))


full.beast2.tree.plot1.meta <- full.beast2.tree.plot1.meta + geom_vline(xintercept = 2000, color='blue', alpha=0.5)
full.beast2.tree.plot1.meta <- full.beast2.tree.plot1.meta + annotate("rect",xmin=2000,xmax=2020,ymin=-4,ymax=521,alpha=0.1, fill='blue')


full.beast2.tree.plot1.meta #+ geom_text2(aes(subset=!isTip, label=node), hjust=-.3, size=2)

Pull in skyline analysis for full beast2 tree

sublineage.skylines.filepath <- beast2.runs.filepath

beast2.full.skyline <- read.table(paste0(beast2.runs.filepath,beast2.full.skyline.path), sep="\t", header=T)
beast2.full.lineages <- read.table(paste0(beast2.runs.filepath,beast2.full.lineages.path), sep="\t", header=T)

# Chris Ruis' script to extract distribution of trees supporting expansion above baseline
# In this analysis, looked for 2-fold expansion (-p 100), in which 0.6745 (67.5%) supported expansion
beast2.full.popdistro <- read.table(paste0(beast2.runs.filepath,beast2.full.popdistro.path), sep="\t", header=T)


# plot skyline
beast2.full.skyline.plot <- ggplot(beast2.full.skyline, aes(Time,Median)) + 
  geom_line() + geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
  theme_light() + 
  scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + 
  scale_y_log10() + 
  coord_cartesian(x=c(1740,2020),y=c(30,3000)) + 
  theme.text.size + labs(y="Relative genetic\ndiversity", x="Year") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.y=element_text(color = "grey25",angle=0, size=10))


# Plot 'lineages through time'
beast2.full.lineages.plot <- ggplot(beast2.full.lineages, aes(Time,Median)) + 
  geom_line() + geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
  theme_light() + 
  scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + 
  scale_y_log10() + 
  coord_cartesian(x=c(1740,2020),y=c(1,1000)) + 
  theme.text.size + labs(y="Inferred Lineages", x="Year") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.y=element_text(color = "grey25",angle=0, size=10))


# Plot population expansion date distro
beast2.full.popdistros.starts <- ggplot(beast2.full.popdistro[beast2.full.popdistro$Increase_date>0,], aes("Full Tree",Increase_date)) +
  geom_sina(alpha=0.75,color="grey80", size=1) + 
  geom_boxplot(alpha=0.0,outlier.shape = NA, width=0.25) +
  coord_cartesian(ylim=c(1740,2020)) +
  scale_y_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01), limits = c(1740,2020)) + 
  theme_light() + 
  coord_flip() + theme.text.size +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.y=element_text(color = "grey25",angle=0, size=10)) +
  theme(axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
  labs(y="Year", x="Distribution density\n(start of 2-fold\npopulation expansion)") + 
  geom_hline(yintercept = 2000, color='blue', alpha=0.5) +
  NULL
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# alternate plot type
beast2.full.popdistros.starts.densiplot <- ggplot(beast2.full.popdistro[beast2.full.popdistro$Increase_date>0,], aes(x=Increase_date)) + 
  geom_density() +
  coord_cartesian(xlim=c(1740,2020)) +
  #scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01), limits=c(1740,2020)) + 
  scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + 
  theme_light() + 
  theme.text.size + 
  labs(x="Year", y="Distribution density\n(start of 2-fold\npopulation expansion)") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) +
  NULL
  



#plot_grid(beast2.full.skyline.plot,beast2.full.lineages.plot, ncol=1, align=T)
#plot_grid(beast2.full.skyline.plot + x.theme.strip, beast2.full.lineages.plot + x.theme.strip, beast2.full.popdistros.starts, ncol=1, align=T)

#plots.beast2.full.skyline.distro.combi <- plot_grid(beast2.full.skyline.plot + x.theme.strip, beast2.full.lineages.plot + x.theme.strip, beast2.full.popdistros.starts.densiplot, ncol=1, align='v',axis='t', labels = c('B','C','D'), label_size = 11,  label_x=0, label_y=1, scale=0.95)

plots.beast2.full.skyline.distro.combi <- plot_grid(beast2.full.skyline.plot + x.theme.strip, beast2.full.popdistros.starts.densiplot, ncol=1, align='v',axis='t', labels = c('B','C'), label_size = 11,  label_x=0, label_y=1, scale=0.95)


plots.beast2.full.skyline.distro.combi

Make combined plot (beast2 full + skylines)

#arrange.plots.beast2.full.skyline.distro.combi <- plot_grid('',plots.beast2.full.skyline.distro.combi, '', ncol=1, rel_heights=c(1,4,2))
arrange.plots.beast2.full.skyline.distro.combi <- plot_grid('',plots.beast2.full.skyline.distro.combi, '', ncol=1, rel_heights=c(1,4,4))


# Plot full BEAST tree with metadata strips
plot.full.beast2.with.skyline.distros <- plot_grid(full.beast2.tree.plot1.meta,arrange.plots.beast2.full.skyline.distro.combi,ncol=2, rel_widths=c(4,2), labels=c('A',''), label_size=11)

# Plot full BEAST tree without metadata
#plot.full.beast2.with.skyline.distros <- plot_grid(full.beast2.tree.plot1 + geom_vline(xintercept = 2000, color='blue', alpha=0.5) + annotate("rect",xmin=2000,xmax=2020,ymin=-4,ymax=521,alpha=0.1, fill='blue'), arrange.plots.beast2.full.skyline.distro.combi,ncol=2, rel_widths=c(4,2), labels=c('A',''), label_size=11)


#Cairo::Cairo(file=paste0(Figure_output_directory, "Figure3_Full-beast2+_skyline-+pop-expansion__02-2021.svg"), width = 1500, height = 1000,type="svg",units = "pt")
plot.full.beast2.with.skyline.distros

#dev.off()

Look at explicit support for population decline and expansion within the Beast2 tree distribution

Extract trees supporting a population 2-fold decline within a defined window (1990-2005) using Chris Ruis’ script:
python3 /nfs/users/nfs_m/mb29/scripts/population_change_support_BEAST.py -t TPA-uber_beast2_strict-skyline-500M_10pop_combined.trees -l TPA-uber_beast2_strict-skyline-500M_10pop_combined.log -p 50 -w 1990 2005 --decrease -d 2019.5 -b 2 -o beast2_strict-skyline-500M_10pop_pop-decline.1990-2005.p50

Analysis shows 82.8% of trees (11191/ ) support a population decline between 1990-2005.

Extract trees supporting a 2-fold population increase within a defined window (2000-2015) using Chris Ruis’ script:
python3 /nfs/users/nfs_m/mb29/scripts/population_change_support_BEAST.py -t TPA-uber_beast2_strict-skyline-500M_10pop_combined.trees -l TPA-uber_beast2_strict-skyline-500M_10pop_combined.log -p 100 -w 2000 2015 -d 2019.5 -b 2

Analysis shows 83.3% of trees (11245/ ) support a population increase between 2000-2015.

To be consistent with the ‘decline’ plot, extracted trees supporting a 2-fold population increase within a defined window (1990-2015) using Chris Ruis’ script:
python3 /nfs/users/nfs_m/mb29/scripts/population_change_support_BEAST.py -t TPA-uber_beast2_strict-skyline-500M_10pop_combined.trees -l TPA-uber_beast2_strict-skyline-500M_10pop_combined.log -p 100 -w 1990 2015 -d 2019.5 -b 2

Analysis shows 58.5% of trees (11245/ ) support a population increase between 1990-2015.


Plot distributions

beast2.pop.decline <- read.csv(beast2.pop.decline.file, header=T, stringsAsFactors=F)
beast2.pop.increase <- read.csv(beast2.pop.increase.file, header=T, stringsAsFactors=F)

beast2.pop.decline$query <- "Decline"
beast2.pop.increase$query <- "Expansion"
beast2.pop.decline.increase <- rbind(beast2.pop.decline,beast2.pop.increase)

# Create dataframe with supporting values for text plotting
#decline.support.value <- 82.8
#increase.support.value <- 83.3
decline.support.value <- "90.7"
increase.support.value <- "59.0"



beast.decline.increase.support.values <- data.frame(query=c("Decline","Expansion"),proportion=c(decline.support.value,increase.support.value), median.date=c(median(beast2.pop.decline$Date_of_change), median(beast2.pop.increase$Date_of_change)), stringsAsFactors=F)


p.beast2.full.popdecline.increase.densiplot <- ggplot(beast2.pop.decline.increase, aes(x=Date_of_change, group=query, color=query, fill=query)) + 
  geom_density(alpha=0.25) +
  scale_x_continuous(expand=c(0.01,0.01)) + 
  theme_light() + 
  theme.text.size + 
  labs(x="Year", y="Posterior distribution density\n(start of 2-fold population decline/expansion)", color="Key") + 
  guides(fill=FALSE) +
  geom_text(data=beast.decline.increase.support.values, aes(x=median.date, y=0.45, label=paste0(proportion,"% of supporting trees")),size=3) + 
  geom_text(data=beast.decline.increase.support.values, aes(x=median.date, y=0.5, label=paste0("median date: ",round(median.date,1))),size=3) + 
  theme(legend.position = 'top') 
  #geom_vline(data=beast.decline.increase.support.values, aes(xintercept=median.date, color=query)) +
  NULL
NULL
#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure8__Skyline-pop-decline+expansion-dates__02-2021.svg"), width = 450, height = 300,type="svg",units = "pt")  
p.beast2.full.popdecline.increase.densiplot

#dev.off()

Look at trees supporting or not-supporting expansion

pop.decline.supporting.trees <- read.nexus(pop.decline.supporting.trees.file)
pop.decline.supporting.trees.sample25 <-sample(pop.decline.supporting.trees,size=25)

pop.decline.notsupporting.trees <- read.nexus(pop.decline.notsupporting.trees.file)
pop.decline.notsupporting.trees.sample25 <-sample(pop.decline.notsupporting.trees,size=25)

TPA.meta1.2.pinecone$Sample_Name.dates <- paste0(TPA.meta1.2.pinecone$Sample_Name,"|",TPA.meta1.2.pinecone$Sample_Year)


p.pop.decline.notsupporting.trees.sample25 <- ggdensitree(pop.decline.notsupporting.trees.sample25, alpha=0.1,mrsd="2019-06-01") + 
  theme_tree2()
Ignoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsd
p.pop.decline.notsupporting.trees.sample25 <- p.pop.decline.notsupporting.trees.sample25 %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name.dates, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage) + 
  geom_tippoint(aes(color=Sublineage), size=1, alpha=0.5) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  # Add date lines for easy interpretation  
  scale_x_continuous(breaks=c(1300,1400,1500,1600,1700,1800,1850,1900,1925,1950,1975,2000,2020), minor_breaks=seq(1950, 2020, 5)) +
  theme(panel.grid.major   = element_line(color="grey50", size=.2),
        panel.grid.minor   = element_line(color="grey85", size=.2),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) + 
  ggtitle("Trees not supporting population expansion") +
  theme(plot.title = element_text(size = 9))
#p.pop.decline.notsupporting.trees.sample25


p.pop.decline.supporting.trees.sample25 <- ggdensitree(pop.decline.supporting.trees.sample25, alpha=0.1,mrsd="2019-06-01") + 
  theme_tree2()
Ignoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsdIgnoring unknown parameters: mrsd
p.pop.decline.supporting.trees.sample25 <- p.pop.decline.supporting.trees.sample25 %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name.dates, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage) + 
  geom_tippoint(aes(color=Sublineage), size=1, alpha=0.5) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  # Add date lines for easy interpretation  
  scale_x_continuous(breaks=c(1300,1400,1500,1600,1700,1800,1850,1900,1925,1950,1975,2000,2020), minor_breaks=seq(1950, 2020, 5)) +
  theme(panel.grid.major   = element_line(color="grey50", size=.2),
        panel.grid.minor   = element_line(color="grey85", size=.2),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) +
  ggtitle("Trees supporting population expansion") +
  theme(plot.title = element_text(size = 9))
#p.pop.decline.supporting.trees.sample25



plot_grid(p.pop.decline.supporting.trees.sample25, p.pop.decline.notsupporting.trees.sample25, ncol=1)

There are no obvious differences in tree topology here. It is likely that these trees either show expansion <2-fold, or the baseline used to average is affecting expansion - e.g. if the baseline was higher at the start of the decline, expansion would not be detected.


Extract individual sublineages for Bayesian Skyline analysis (only take sublineages of decent size, with no outgroups)

major.multicountry.sublineages <- sublineage.counts[sublineage.counts$Count>10,]
major.multicountry.sublineages
sublineage.1.meta <- TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage==1,c("Cleaned_fastq_id","Sample_Name","Sample_Year")]
sublineage.2.meta <- TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage==2,c("Cleaned_fastq_id","Sample_Name","Sample_Year")]
#sublineage.4.meta <- TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage==4,c("Cleaned_fastq_id","Sample_Name","Sample_Year")]
sublineage.8.meta <- TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage==8,c("Cleaned_fastq_id","Sample_Name","Sample_Year")]
sublineage.14.meta <- TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage==14,c("Cleaned_fastq_id","Sample_Name","Sample_Year")]

all.pinecone.meta <- TPA.meta1.2.pinecone.havedates[,c("Cleaned_fastq_id","Sample_Name","Sample_Year")]

Check temporal signal by extracting subtrees


p.sublin.1.r2t <- plot.subtree.R2T(TPA.MLtree,sublineage.1.meta$Sample_Name,TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])
p.sublin.2.r2t <- plot.subtree.R2T(TPA.MLtree,sublineage.2.meta$Sample_Name,TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])
p.sublin.8.r2t <- plot.subtree.R2T(TPA.MLtree,sublineage.8.meta$Sample_Name,TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])
p.sublin.14.r2t <- plot.subtree.R2T(TPA.MLtree,sublineage.14.meta$Sample_Name,TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])



#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure10_Sublineages_trees+root2tip__02-2021.svg"), width = 1000, height = 1000,type="svg",units = "pt")
plot_grid(p.sublin.1.r2t,p.sublin.2.r2t,p.sublin.8.r2t,p.sublin.14.r2t,ncol=2, labels=c('Sublineage 1','Sublineage 2','Sublineage 8','Sublineage 14'),scale = 0.85,label_size=12)

#dev.off()

Evaluate subtree skyline analyses

Sublineage 1 (new)
Seqs 365, sites 278, span 1981-2019
- Good convergence in all traces, strong ESS for most variables, but lower for a few of the skyline groups (ESS>100, apart from Skyline.Groupsize10 with ESS 79)
- clock rate 1.34e-7

Sublineage 2 (new)
Seqs 32, sites 36, span 2000-2019
- Good convergence in all traces, very strong ESS in all cases
- Skyline strong expanding skyline plot
- clock rate 1.57e-7

Sublineage 8 (new)
Seqs 15, sites 31, span 1986-2019
- Good convergence in all traces, very strong ESS in all cases
- Skyline - good skyline, showing possible expansion during 90s, possible contraction in 2010s)
- clock rate 1.80e-7

Sublineage 14 (new)
Seqs 55, sites 23, span 2013-2019
- did not converge (at all)
- No temporal signal (in r2t plots)




Define some functions to plot



plot.skyline.data <- function(skyline.file,lineages.file){  
  skyline.data <- read.table(paste0(sublineage.skylines.filepath,skyline.file), header=T, sep="\t", comment.char="",stringsAsFactors=F)
  skyline.plot <- ggplot(skyline.data, aes(Time,Median)) + 
    geom_line() + geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
    theme_light() + 
    #scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + 
    scale_x_continuous(breaks=c(seq(1980,2020,20)), expand=c(0.01,0.01)) + 
    scale_y_log10() + 
    coord_cartesian(x=c(1980,2020)) + 
    theme.text.size + labs(y="Relative genetic diversity", x="Year") + 
    geom_vline(xintercept = 2000, color='blue', alpha=0.5) + annotate("rect",xmin=2000,xmax=2020,ymin=0,ymax=10000,alpha=0.1, fill='blue')
  
  # Lineages
  skyline.lineages <- read.table(paste0(sublineage.skylines.filepath,lineages.file), header=T, sep="\t", comment.char="",stringsAsFactors=F)
  lineages.plot  <- ggplot(skyline.lineages, aes(Time,Median)) + 
    geom_line() + geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
    theme_light() + 
    scale_x_continuous(breaks=c(seq(1980,2020,20)), expand=c(0.01,0.01)) + 
    #scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + 
    scale_y_log10() + 
    coord_cartesian(x=c(1980,2020), y=c(1,300)) + 
    #scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + 
    theme.text.size + labs(y="Lineages", x="Year") + 
    geom_vline(xintercept = 2000, color='blue', alpha=0.5) + annotate("rect",xmin=2000,xmax=2020,ymin=0,ymax=500,alpha=0.1, fill='blue')
  
  skyline.combined.plot <- plot_grid(skyline.plot,lineages.plot, ncol=1, align=T)
  return(skyline.combined.plot)
  #return(skyline.plot)
}

Plot sublineage skylines with lineages through time - all together


sublineage.skylines.filepath <- "/Users/mb29/Treponema/Expanded_Global_Sequencing/Analysis/Global_TPA_uber-analysis_2020_v2/phylo/good_cov_dataset/gubbins_2.4.1__20-11-10/beast/individual_sublineages_reclassified__2021-02-04/beauti/outputs/"
  
collected.skylines <- NULL
for (sublineage in c(1,2,8)) {
  sublineage.skyline <- read.table(paste0(sublineage.skylines.filepath,"TPA-uber.remasked.2020-11-10.goodcov25.gubbins.WGS.sublineage.new.",sublineage,".noinv.Strict-Skyline_combined.skyline-data.tsv"), header=T, sep="\t", comment.char="",stringsAsFactors=F)
  sublineage.skyline$sublineage <- sublineage 
  collected.skylines <- rbind(collected.skylines, sublineage.skyline)
}
beast2.full.skyline$sublineage <- "All"
collected.skylines.inc.full <- rbind(collected.skylines, beast2.full.skyline)
collected.skylines.inc.full$sublineage <- factor(collected.skylines.inc.full$sublineage, levels=unique(collected.skylines.inc.full$sublineage))

sublineage.skyline.combi.plot <- ggplot(collected.skylines.inc.full, aes(Time,Median)) + 
  geom_line() + geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
  theme_light() + 
  #scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + 
  scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + 
  scale_y_log10() + 
  coord_cartesian(x=c(1900,2020),y=c(1,3000)) + 
  theme.text.size + labs(y="Relative genetic diversity (per sublineage)", x="Year") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) + #annotate("rect",xmin=2000,xmax=2020,ymin=0,ymax=10000,alpha=0.1, fill='blue') + 
  facet_grid(sublineage~.) +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.y=element_text(color = "grey25",angle=0, size=10))


# Do same for 'lineages through time'
collected.skylines.lineages <- NULL
for (sublineage in c(1,2,8)) {
  sublineage.skyline.lineages <- read.table(paste0(sublineage.skylines.filepath,"TPA-uber.remasked.2020-11-10.goodcov25.gubbins.WGS.sublineage.new.",sublineage,".noinv.Strict-Skyline_combined.lineages-data.tsv"), header=T, sep="\t", comment.char="",stringsAsFactors=F)
  sublineage.skyline.lineages$sublineage <- sublineage 
  collected.skylines.lineages <- rbind(collected.skylines.lineages, sublineage.skyline.lineages)
}
beast2.full.lineages$sublineage <- "All"
collected.skylines.lineages.inc.full <- rbind(collected.skylines.lineages, beast2.full.lineages)
collected.skylines.lineages.inc.full$sublineage <- factor(collected.skylines.lineages.inc.full$sublineage, levels=unique(collected.skylines.lineages.inc.full$sublineage))


sublineage.skyline.lineages.combi.plot <- ggplot(collected.skylines.lineages.inc.full, aes(Time,Median)) + 
  geom_line() + geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
  theme_light() + 
  scale_x_continuous(breaks=c(seq(1980,2020,10)), expand=c(0.01,0.01)) + 
  scale_y_log10() + 
  coord_cartesian(x=c(1980,2020)) + 
  theme.text.size + labs(y="Inferred Lineages (per sublineage)", x="Year") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) + #annotate("rect",xmin=2000,xmax=2020,ymin=0,ymax=10000,alpha=0.1, fill='blue') + 
  facet_grid(sublineage~.) +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.y=element_text(color = "grey25",angle=0, size=10))


#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure9_sublineages_skylines__02-2021.svg"), width = 300, height = 400,type="svg",units = "pt")
sublineage.skyline.combi.plot

#dev.off()

Look at Chris Ruis’s tool for extracting the date distributino of a population size increase
Run as:
python3 ~/scripts/population_increase_distribution_BEAST.py -b1 -t TPA-uber.remasked.2020-11-10.goodcov25.gubbins.WGS.sublineage.12.Strict-Skyline_combined.trees -l TPA-uber.remasked.2020-11-10.goodcov25.gubbins.WGS.sublineage.12.Strict-Skyline_combined.log -d 2019.5 -o TPA-uber.remasked.2020-11-10.goodcov25.gubbins.WGS.sublineage.12.Strict-Skyline_combined.pop-distributions.txt

All pop distros

# "TPA-uber_beast2_strict-skyline-500M_10pop_combined.pop-distributions_p100.txt"
pop.distro.subsampled.all.file <- beast2.full.popdistro.path 


# Do same for 'lineages through time'
collected.skyline.popdistros <- NULL
for (sublineage in c(1,2,8)) {
  skyline.popdistro <- read.table(paste0(pop.distro.path,"TPA-uber.remasked.2020-11-10.goodcov25.gubbins.WGS.sublineage.new.",sublineage,".noinv.Strict-Skyline_combined.pop-expansion"), header=T, sep="\t", comment.char="",stringsAsFactors=F)
  skyline.popdistro$sublineage <- sublineage 
  collected.skyline.popdistros <- rbind(collected.skyline.popdistros, skyline.popdistro)
}
# do 'All' separately
sublineage.all.skyline.pop.distro <- read.table(paste0(pop.distro.path,pop.distro.subsampled.all.file), header=T, sep="\t", comment.char="",stringsAsFactors=F)
sublineage.all.skyline.pop.distro$sublineage <- "All"
collected.skyline.popdistros <- rbind(collected.skyline.popdistros,sublineage.all.skyline.pop.distro)


collected.skyline.popdistros$sublineage <- factor(collected.skyline.popdistros$sublineage, levels=unique(collected.skyline.popdistros$sublineage))

x.theme.strip.partial <- theme(axis.text.x = element_blank(), axis.ticks.x= element_blank())


plot.collected.skyline.popdistros.starts <- ggplot(collected.skyline.popdistros[collected.skyline.popdistros$Increase_date>0,], aes(sublineage,Increase_date)) +
  geom_sina(alpha=0.2,color="grey80", size=1) + 
  geom_boxplot(alpha=0.0,outlier.shape = NA, width=0.25) +
  coord_cartesian(ylim=c(1900,2020)) + 
  theme_light() + 
  ylim(1940,2020) +
  facet_grid(sublineage~., scales="free_y") +
  coord_flip() + theme.text.size +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.y=element_text(color = "grey25",angle=0, size=10)) +
  #y.theme.strip +
  theme(axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
  labs(y="Year", x="Distribution density (start date for 5-fold population expansion)") + 
  geom_hline(yintercept = 2000, color='blue', alpha=0.5) +
  NULL
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
#plot.collected.skyline.popdistros.starts


plot.collected.skyline.popdistros.starts.density <- ggplot(collected.skyline.popdistros[collected.skyline.popdistros$Increase_date>0,], aes(Increase_date)) + 
  geom_density() +
  coord_cartesian(xlim=c(1900,2020)) +
  #scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01), limits=c(1740,2020)) + 
  scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + 
  facet_grid(sublineage~., scales="free_y") +
  #facet_grid(sublineage~.) +
  theme_light() + 
  theme.text.size + 
  #theme(axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.y=element_text(color = "grey25",angle=0, size=10)) +
  labs(x="Year", y="Distribution density (start of 2-fold population expansion)") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) +
  NULL
plot.collected.skyline.popdistros.starts.density


plot_grid(sublineage.skyline.combi.plot,  sublineage.skyline.lineages.combi.plot, plot.collected.skyline.popdistros.starts.density,ncol=3, align=T, axis="ltb",rel_widths=c(4,2,3), labels=c('A - Genetic Diversity','B - Lineages','C - Population Expansion'), label_size=11,label_y=1, label_x=0.01, scale=0.95) 


#Want to look at distributions within single countries

Extract UK and Canada (all) subtrees


subtree.UK.meta <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Geo_Country=="UK"),c("Cleaned_fastq_id","Sample_Name","Sample_Year")]
p.subtree.UK.r2t <- plot.subtree.R2T(TPA.MLtree,subtree.UK.meta$Sample_Name,TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])
`geom_smooth()` using formula 'y ~ x'
subtree.BC.meta <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Geo_Region=="British_Columbia"),c("Cleaned_fastq_id","Sample_Name","Sample_Year")]
p.subtree.BC.r2t <- plot.subtree.R2T(TPA.MLtree,subtree.BC.meta$Sample_Name,TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])
`geom_smooth()` using formula 'y ~ x'
plot_grid(p.subtree.UK.r2t, p.subtree.BC.r2t, nrow=2)

Extract UK and Canada subtrees for Sublineage 2 only


sublineage.2.UK.meta <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$TPA.pinecone.sublineage==2 & TPA.meta1.2.pinecone$Geo_Country=="UK"),c("Cleaned_fastq_id","Sample_Name","Sample_Year")]
p.sublin.2.UK.r2t <- plot.subtree.R2T(TPA.MLtree,sublineage.2.UK.meta$Sample_Name,TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])
`geom_smooth()` using formula 'y ~ x'
sublineage.2.BC.meta <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$TPA.pinecone.sublineage==2 & TPA.meta1.2.pinecone$Geo_Region=="British_Columbia"),c("Cleaned_fastq_id","Sample_Name","Sample_Year")]
p.sublin.2.BC.r2t <- plot.subtree.R2T(TPA.MLtree,sublineage.2.BC.meta$Sample_Name,TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])
`geom_smooth()` using formula 'y ~ x'
plot_grid(p.sublin.2.UK.r2t, p.sublin.2.BC.r2t, nrow=2)

So very little temporal signal here, particularly for the UK data (only 7 years). Not realistic to analyse this way.

Look at coverage statistics

# For the large low cov dataset
nrow(TPA.meta1.2)
[1] 726
mean(as.numeric(TPA.meta1.2$Mean_mapping_coverage), na.rm=T)
[1] 84.41708
max(as.numeric(TPA.meta1.2$Mean_mapping_coverage))
[1] 727.8
min(as.numeric(TPA.meta1.2$Mean_mapping_coverage))
[1] 3.1
(1-mean(as.numeric(TPA.meta1.2$`Proportion-N_>5_mapping+masking_Nichols`), na.rm=T))*100
[1] 82.088
(1-max(as.numeric(TPA.meta1.2$`Proportion-N_>5_mapping+masking_Nichols`), na.rm=T))*100
[1] 25.0359
(1-min(as.numeric(TPA.meta1.2$`Proportion-N_>5_mapping+masking_Nichols`), na.rm=T))*100
[1] 96.92742
# For the curated dataset
nrow(TPA.meta1.2.pinecone)
[1] 528
mean(as.numeric(TPA.meta1.2.pinecone$Mean_mapping_coverage), na.rm=T)
[1] 111.0958
max(as.numeric(TPA.meta1.2.pinecone$Mean_mapping_coverage))
[1] 727.8
min(as.numeric(TPA.meta1.2.pinecone$Mean_mapping_coverage))
[1] 11.1
(1-mean(as.numeric(TPA.meta1.2.pinecone$`Proportion-N_>5_mapping+masking_Nichols`), na.rm=T))*100
[1] 92.89512
(1-max(as.numeric(TPA.meta1.2.pinecone$`Proportion-N_>5_mapping+masking_Nichols`), na.rm=T))*100
[1] 75.0879
(1-min(as.numeric(TPA.meta1.2.pinecone$`Proportion-N_>5_mapping+masking_Nichols`), na.rm=T))*100
[1] 96.92742

Quickly look at lineage association of recombination blocks

# Define and read in recombination info
recombination_event.data <- readxl::read_excel(recombination_event.file,sheet="Recombination_Data")

# Subset to gubbins filtered regions and order by event
recombination_event.data.gubbins <- recombination_event.data[recombination_event.data$Gubbins_Event_ID!="-",]
recombination_event.data.gubbins <- recombination_event.data.gubbins[!is.na(recombination_event.data.gubbins$line_order),]
recombination_event.data.gubbins <- recombination_event.data.gubbins[order(as.numeric(recombination_event.data.gubbins$Gubbins_Event_ID)),]

# convert per-event list into a matrix
recombination_event.data.gubbins.matrix <- reshape2::dcast(reshape2::melt(strsplit(recombination_event.data.gubbins$SampleIDs,",")),L1~value)
colnames(recombination_event.data.gubbins.matrix)[1] <- "Gubbins_Event_ID"

# melt matrix into longform
recombination_event.data.gubbins.matrix.melt <- reshape2::melt(recombination_event.data.gubbins.matrix,id.vars="Gubbins_Event_ID")
# infer presence/absence
recombination_event.data.gubbins.matrix.melt$binary <- ifelse(is.na(recombination_event.data.gubbins.matrix.melt$value),0,1)
colnames(recombination_event.data.gubbins.matrix.melt) <- c("Gubbins_Event_ID","Sample_Name","value","recomb.present")

# Make a binary presence/absence matrix organised by sample (for possible plotting with ggtree)
recombination_event.data.gubbins.matrix.binary <- reshape2::dcast(recombination_event.data.gubbins.matrix.melt[,c("Gubbins_Event_ID","Sample_Name","recomb.present")], Sample_Name~Gubbins_Event_ID)
Using recomb.present as value column: use value.var to override.
# Merge in Lineage and Pinecone information
recombination_event.data.gubbins.matrix.melt <- plyr::join(recombination_event.data.gubbins.matrix.melt[c("Gubbins_Event_ID","Sample_Name","recomb.present")],TPA.meta1.2.pinecone[,c("Sample_Name","TPA.pinecone.sublineage","TPA_Lineage")], by="Sample_Name")


recombination_event.by.Lineage <- recombination_event.data.gubbins.matrix.melt %>% dplyr::group_by(Gubbins_Event_ID, TPA_Lineage) %>% 
  dplyr::summarise(Sum=sum(recomb.present))
`summarise()` regrouping output by 'Gubbins_Event_ID' (override with `.groups` argument)
#recombination_event.by.Lineage[recombination_event.by.Lineage$Sum!=0,]

recombination_event.by.sublineage <- recombination_event.data.gubbins.matrix.melt %>% dplyr::group_by(Gubbins_Event_ID, TPA.pinecone.sublineage) %>% 
  dplyr::summarise(Sum=sum(recomb.present))
`summarise()` regrouping output by 'Gubbins_Event_ID' (override with `.groups` argument)
recombination_event.by.sublineage[recombination_event.by.sublineage$Sum!=0,]
NA
NA
recombination_event.data.gubbins.matrix.melt <- plyr::join(recombination_event.data.gubbins.matrix.melt, recombination_event.data.gubbins[,c("Gubbins_Event_ID","Block_Start","Block_End")], by="Gubbins_Event_ID")

recombiplot.tip.order <- data.frame(Sample_Name=get_taxa_name(ggtree(TPA.MLtree)),stringsAsFactors = F) 
recombiplot.tip.order$order <- c(1:nrow(recombiplot.tip.order))
recombiplot.tip.order$order2 <- rev(c(1:nrow(recombiplot.tip.order)))
recombiplot.tip.order <- plyr::join(recombiplot.tip.order,recombination_event.data.gubbins.matrix.melt, by="Sample_Name", type="full")


p.recombi.plot <- ggplot(recombiplot.tip.order) +
  geom_rect(aes(ymin=order2-0.5,ymax=order2+0.5, xmin=as.numeric(Block_Start), xmax=as.numeric(Block_End)), alpha=0.5) +
  coord_cartesian(xlim=c(1,1139569)) +
  theme_minimal()

#TPA.MLtree.ggtree.tippoint


#ggtree(TPA.MLtree) + theme(legend.position="none")
plot_grid(ggtree(TPA.MLtree), p.recombi.plot, align=T, axis="tb", rel_widths = c(1,2))
Removed 73 rows containing missing values (geom_rect).


TPA.MLtree.testplot <- ggtree(TPA.MLtree)
#facet_plot(TPA.MLtree.testplot, panel="Genome_Blocks", data=recombiplot.tip.order, geom_rect, mapping=aes(xmin=Block_Start, xmax=Block_End, fill=TPA_Lineage))
recombining.genes <- c("TPASS_RS00040", "TPASS_RS00045", "TPASS_RS00590", "TPASS_RS00675", "TPASS_RS01555", "TPASS_RS01565", "TPASS_RS01570", "TPASS_RS02125", "TPASS_RS02290", "TPASS_RS02700", "TPASS_RS03020", "TPASS_RS03070", "TPASS_RS03075", "TPASS_RS04240", "TPASS_RS04245", "TPASS_RS04250", "TPASS_RS04275", "TPASS_RS05385", "TPASS_RS05110", "TPASS_RS05210,TPASS_RS00685,TPASS_RS00705,TPASS_RS00690,TPASS_RS00680,TPASS_RS00700,TPASS_RS00675,TPASS_RS00665,TPASS_RS00670", "TPASS_RS00705,TPASS_RS00685,TPASS_RS00690,TPASS_RS00700", "TPASS_RS00705", "TPASS_RS00705", "TPASS_RS00705", "TPASS_RS00905", "TPASS_RS01600", "TPASS_RS01600", "TPASS_RS02265", "TPASS_RS02265", "TPASS_RS02265", "TPASS_RS02350,TPASS_RS02355", "TPASS_RS02375", "TPASS_RS02525", "TPASS_RS02760,TPASS_RS02755", "TPASS_RS03055,TPASS_RS03065,TPASS_RS03060", "TPASS_RS03065", "TPASS_RS04780,TPASS_RS04790,TPASS_RS04785,TPASS_RS04775", "TPASS_RS05100,TPASS_RS05105")


recombining.genes <- c("TPASS_RS05210,TPASS_RS00685,TPASS_RS00705,TPASS_RS00690,TPASS_RS00680,TPASS_RS00700,TPASS_RS00675,TPASS_RS00665,TPASS_RS00670", "TPASS_RS00705,TPASS_RS00685,TPASS_RS00690,TPASS_RS00700", "TPASS_RS00705", "TPASS_RS00705", "TPASS_RS00705", "TPASS_RS00905", "TPASS_RS01600", "TPASS_RS01600", "TPASS_RS02265", "TPASS_RS02265", "TPASS_RS02265", "TPASS_RS02350,TPASS_RS02355", "TPASS_RS02375", "TPASS_RS02525", "TPASS_RS02760,TPASS_RS02755", "TPASS_RS03055,TPASS_RS03065,TPASS_RS03060", "TPASS_RS03065", "TPASS_RS04780,TPASS_RS04790,TPASS_RS04785,TPASS_RS04775", "TPASS_RS05100,TPASS_RS05105")

unique(unlist(strsplit(recombining.genes,",")))
 [1] "TPASS_RS05210" "TPASS_RS00685" "TPASS_RS00705" "TPASS_RS00690" "TPASS_RS00680" "TPASS_RS00700" "TPASS_RS00675" "TPASS_RS00665"
 [9] "TPASS_RS00670" "TPASS_RS00905" "TPASS_RS01600" "TPASS_RS02265" "TPASS_RS02350" "TPASS_RS02355" "TPASS_RS02375" "TPASS_RS02525"
[17] "TPASS_RS02760" "TPASS_RS02755" "TPASS_RS03055" "TPASS_RS03065" "TPASS_RS03060" "TPASS_RS04780" "TPASS_RS04790" "TPASS_RS04785"
[25] "TPASS_RS04775" "TPASS_RS05100" "TPASS_RS05105"

Tip date randomisation using TipDatingBEAST package

library(TipDatingBeast)

# setwd("/Users/mb29/Treponema/Expanded_Global_Sequencing/Analysis/Global_TPA_uber-analysis_2020_v2/phylo/good_cov_dataset/gubbins_2.4.1__20-11-10/beast/beast2_full-tree_2020-11-25/beauti/tip_randomisation/")

#setwd("/Users/mb29/Treponema/Expanded_Global_Sequencing/Analysis/Global_TPA_uber-analysis_2020_v2/phylo/good_cov_dataset/gubbins_2.4.1__20-11-10/beast/beast2_full-tree_2020-11-25/beauti/tip_randomisation/rerun_MEL-1_01-03-2021/")

original.xml.file <- "beast2_Strict-Skyline_Full500M_+sites_1"

# Generate tipdate-randomised xml files from original BEAST2 xml
#TipDatingBeast::RandomDates(name="beast2_Strict-Skyline_Full500M_+sites_1",reps=20)

# All datasets run in BEAST2. 

# After 20 runs (took ~2 weeks) ensure all files are correctly labelled and pull in data
#TipDatingBeast::PlotDRT(name="beast2_Strict-Skyline_Full500M_+sites_1",reps=20,burnin=0.1)

# That plot isn't very nice, but the package also generates a csv file containing the values of interest that we can plot

randomtip.summary <- read.csv(random.tip.summary.file,header=T)

randomtip.summary$Data <- ifelse(randomtip.summary$calibr==0,"Real","Randomised")

p.tipdaterandomisation.normal <- ggplot(randomtip.summary) +
  geom_pointrange(aes(x=calibr, y=median,ymin = lowerHPD, ymax = HigherHPD,color=Data)) +
  theme_light() + theme(legend.position="top") +
  labs(x="Replicate", y="Clock Rate") +
  geom_hline(yintercept=randomtip.summary$lowerHPD[1],alpha=0.5) +
  geom_hline(yintercept=randomtip.summary$HigherHPD[1],alpha=0.5) +
  geom_hline(yintercept=randomtip.summary$HigherHPD[18],alpha=0.5) +
  NULL
#p.tipdaterandomisation.normal

p.tipdaterandomisation.log10 <- ggplot(randomtip.summary) +
  geom_pointrange(aes(x=calibr, y=median,ymin = lowerHPD, ymax = HigherHPD,color=Data)) +
  theme_light() + 
  theme(legend.position="top") +
  #scale_y_log10() +
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
              labels = trans_format("log10", math_format(10^.x))) +
  labs(x="Replicate", y="Clock Rate") +
  geom_hline(yintercept=randomtip.summary$lowerHPD[1],alpha=0.5) +
  geom_hline(yintercept=randomtip.summary$HigherHPD[1],alpha=0.5) +
  NULL

#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure19_tipdate-randomisation.svg"), width = 550, height = 300,type="svg",units = "pt")
p.tipdaterandomisation.log10

#dev.off()

randomtip.summary[randomtip.summary$calibr==0,"median"]
[1] 1.274125e-07
max(randomtip.summary[randomtip.summary$calibr!=0,"median"])
[1] 8.016449e-09

Distribution of Rabbit passaged strains


sublineage.passaged.samples <- data.frame(TPA.meta1.2.pinecone %>% dplyr::group_by(TPA.pinecone.sublineage,Direct_from_clin) %>%
  dplyr::summarise(count=n()),stringsAsFactors = F)
`summarise()` regrouping output by 'TPA.pinecone.sublineage' (override with `.groups` argument)
sublineage.passaged.samples$TPA.pinecone.sublineage <- factor(sublineage.passaged.samples$TPA.pinecone.sublineage, levels=rev(sublineages.cols.brew$sublineage))

p.sublineage.rabbit.passage <- ggplot(sublineage.passaged.samples, aes(count, TPA.pinecone.sublineage, fill=Direct_from_clin)) +
  geom_barh(stat="identity",position="fill", width=0.75) +
  theme_light() +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='bottom') +
  scale_fill_manual(name="Direct from Clinical sample\n(no rabbit passage)",values=c("grey80","grey10")) +
  labs(y="Sublineage", x="Proportion passaged")

#p.sublineage.rabbit.passage


#data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Direct_Sequenced=TPA.meta1.2.pinecone$Direct_from_clin, stringsAsFactors=F)

p.MLtree.rabbit.passage.distros <- gheatmap(TPA.MLtree.ggtree.tippoint + theme(legend.position="none"),
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, `Direct from clinical`=TPA.meta1.2.pinecone$Direct_from_clin, stringsAsFactors=F), color='grey70',width=0.075,offset=0.00000725, colnames_angle=-45,colnames_offset_y=0, hjust=0,font.size=2) + 
  scale_fill_manual(name="Direct from clinical sample",values=c("grey80","grey10")) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right')
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
p.sublineage.rabbit.passage.toprow <- plot_grid('',p.sublineage.rabbit.passage + theme(legend.position='none'),'',ncol=1,rel_heights=c(1,3,1))
Cannot convert object of class character into a grob.Cannot convert object of class character into a grob.
p.rabbit.passage.distros.combination <- plot_grid(p.MLtree.rabbit.passage.distros, p.sublineage.rabbit.passage.toprow, rel_widths = c(3,1), labels=c('A','B'), label_size=11)


#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure7__MLtree+rabbitpassage_02-2021.svg"), width = 1000, height = 800,type="svg",units = "pt")
p.rabbit.passage.distros.combination

#dev.off()





Look at distribution of SNPs on genome



WGS.site.positions <- read.table(WGS.site.positions.file, stringsAsFactors = F, header=F)
colnames(WGS.site.positions) <- "position"

ggplot(WGS.site.positions, aes(position)) + 
  geom_density() +
  theme_light()


WGS.site.positions$SNP <- 1
WGS.site.positions.all <- plyr::join(WGS.site.positions, data.frame(position=c(1:1139569),stringsAsFactors=F), type="right", by="position")
WGS.site.positions.all[is.na(WGS.site.positions.all$SNP),"SNP"] <- 0


windowsize <- 1000
WGS.site.positions.all$window <- ((trunc(as.numeric(WGS.site.positions.all$position) / windowsize,0))*windowsize)
WGS.SNP.density.window <- WGS.site.positions.all %>% 
  group_by(window) %>% 
  dplyr::summarise(mean = mean(SNP), count=sum(SNP))
`summarise()` ungrouping output (override with `.groups` argument)
p.WGS.SNP.density <- ggplot(WGS.SNP.density.window, aes(window, count)) + 
  geom_point(alpha=0.5) +
  theme_light() + 
  labs(x=paste0("Genome Position (",windowsize," bp windows)"), y=paste0("Variable sites/",windowsize," bp")) +
  theme.text.size
p.WGS.SNP.density

NA
NA
NA
NA

Bring macrolide resistance back in

Look at macrolide resistance



TPA.global.compmapping.23s <- read.table(TPA.global.compmapping.23s.file, header=T, sep="\t", check.names = F, comment.char = "")


# Missing tree samples from 23s data
missing.23S <- TPA.MLtree$tip.label[TPA.MLtree$tip.label %notin% TPA.global.compmapping.23s$Sample] 
missing.23S.meta <- TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Sample_Name %in% missing.23S, "Cleaned_fastq_id"]

# Only keep relevant values
TPA.global.compmapping.23s <- TPA.global.compmapping.23s[TPA.global.compmapping.23s$Sample %in% TPA.MLtree$tip.label,]


TPA.global.compmapping.23s$Sample_Name <- TPA.global.compmapping.23s$Sample

# Evaluate alleles (again)
TPA.global.compmapping.23s$VariantPresent_A2058G_redo <- ifelse((TPA.global.compmapping.23s$ALT_A2058G=="G" & TPA.global.compmapping.23s$DP_A2058G>20 & TPA.global.compmapping.23s$AltPerc_A2058G>95),"Yes", ifelse(TPA.global.compmapping.23s$VariantPresent_A2058G=="Hetero","Uncertain",ifelse((TPA.global.compmapping.23s$ALT_A2058G=="G" & TPA.global.compmapping.23s$DP_A2058G<=20 & TPA.global.compmapping.23s$AltPerc_A2058G>95),"Uncertain","No")))

TPA.global.compmapping.23s$VariantPresent_A2059G_redo <- ifelse((TPA.global.compmapping.23s$ALT_A2059G=="G" & TPA.global.compmapping.23s$DP_A2059G>20 & TPA.global.compmapping.23s$AltPerc_A2059G>95),"Yes", ifelse(TPA.global.compmapping.23s$VariantPresent_A2059G=="Hetero","Uncertain",ifelse((TPA.global.compmapping.23s$ALT_A2059G=="G" & TPA.global.compmapping.23s$DP_A2059G<=20 & TPA.global.compmapping.23s$AltPerc_A2058G>95),"Uncertain","No")))


TPA.global.compmapping.23s$resistant <- ifelse((TPA.global.compmapping.23s$VariantPresent_A2058G_redo=="Hetero" | TPA.global.compmapping.23s$VariantPresent_A2059G_redo=="Uncertain"),"Uncertain",ifelse(TPA.global.compmapping.23s$VariantPresent_A2058G_redo=="Yes" & TPA.global.compmapping.23s$VariantPresent_A2059G_redo=="No", "A2058G", ifelse(TPA.global.compmapping.23s$VariantPresent_A2058G_redo=="No" & TPA.global.compmapping.23s$VariantPresent_A2059G_redo=="Yes","A2059G", ifelse(TPA.global.compmapping.23s$VariantPresent_A2058G_redo=="No" & TPA.global.compmapping.23s$VariantPresent_A2059G_redo=="No", "Sensitive","Uncertain"))))

Now plot in a nice tree


TPA.global.compmapping.23s.p <- data.frame(row.names=TPA.global.compmapping.23s$Sample, A2058G=TPA.global.compmapping.23s$VariantPresent_A2058G_redo, A2059G=TPA.global.compmapping.23s$VariantPresent_A2059G_redo)

p.MLtree.23S.distros <- gheatmap(TPA.MLtree.ggtree.tippoint + theme(legend.position="none"),
               TPA.global.compmapping.23s.p, color='grey70',width=0.075,offset=0.00000725, colnames_angle=-45,colnames_offset_y=0, hjust=0,font.size=2) + 
  #scale_fill_manual(name="Resistance\nAllele\nPresent",values=c("grey50","grey95","black"), breaks=c("Uncertain","No","Yes")) +
  scale_fill_manual(name="Resistance\nAllele\nPresent",values=c("black","grey95","grey50"), breaks=c("Yes","No","Uncertain")) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='left')

p.MLtree.23S.distros

Look at sublineage distribution

TPA.meta1.2.pinecone.23S <- plyr::join(TPA.meta1.2.pinecone,data.frame(Sample_Name=TPA.global.compmapping.23s$Sample, A2058G=TPA.global.compmapping.23s$VariantPresent_A2058G_redo, A2059G=TPA.global.compmapping.23s$VariantPresent_A2059G_redo), by="Sample_Name")


TPA.meta1.2.pinecone.23S.counts <- data.frame(TPA.meta1.2.pinecone.23S %>% 
                                                dplyr::group_by(TPA.pinecone.sublineage, A2058G, A2059G) %>%
                                                dplyr::summarise(Count=n()),stringsAsFactors = F)
TPA.meta1.2.pinecone.23S.counts <- reshape2::melt(TPA.meta1.2.pinecone.23S.counts,id.vars=c("TPA.pinecone.sublineage","Count"))


TPA.meta1.2.pinecone.23S.counts$TPA.pinecone.sublineage <- factor(TPA.meta1.2.pinecone.23S.counts$TPA.pinecone.sublineage, levels=rev(sublineages.cols.brew$sublineage))

#TPA.meta1.2.pinecone.23S.counts <- TPA.meta1.2.pinecone.23S.counts[TPA.meta1.2.pinecone.23S.counts$TPA.pinecone.sublineage!="Singleton",]

TPA.meta1.2.pinecone.23S.counts$value <- ifelse(TPA.meta1.2.pinecone.23S.counts$value=="Hetero", "Uncertain",TPA.meta1.2.pinecone.23S.counts$value)

# Plot SNPs by sublineage
p.sublineage.23S.compmap <- ggplot(TPA.meta1.2.pinecone.23S.counts, aes(Count, TPA.pinecone.sublineage, fill=value, color=NULL)) +
  geom_barh(stat="identity", position="fill",width=0.75) + 
  facet_grid(.~variable) + 
  theme_light() +
  scale_fill_manual(values=c("black","grey90","grey50"), breaks=c("Yes","No","Uncertain")) +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.x=element_text(color = "grey25",angle=0, size=10)) +
  labs(x="Proportion of samples",y="Sublineage", fill="Resistance\nallele\npresent") +
  theme.text.size + theme(legend.key.size = unit(0.75,"line"),legend.position='bottom') 
#p.sublineage.23S.compmap



# Want to include Singletons for this analyis  - Redo country sublineage distros
sublineage.country.counts.incSing <- plyr::join(TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Sample_Year!="-"),c("Sample_Name","TPA.pinecone.sublineage")], sublineage.classification, by="TPA.pinecone.sublineage",type="left")
sublineage.country.counts.incSing[is.na(sublineage.country.counts.incSing$private.distro),"private.distro"] <- "Singleton"
sublineage.country.counts.incSing <- sublineage.country.counts.incSing %>% group_by(TPA.pinecone.sublineage, private.distro) %>% summarise(Count=n())
sublineage.country.counts.incSing$TPA.pinecone.sublineage <- factor(sublineage.country.counts.incSing$TPA.pinecone.sublineage,levels=rev(sublineages.cols.brew$sublineage))

p.sublineage.private.hbarplot.incSing <- ggplot(sublineage.country.counts.incSing, aes(Count,TPA.pinecone.sublineage,fill=private.distro)) +
  geom_barh(stat="identity", position="stack", width=0.75) +
  theme_light() +
  #scale_x_log10() +
  coord_cartesian(xlim=c(0,410)) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='bottom') +
  scale_fill_manual(breaks=(unique(sublineage.country.counts.incSing$private.distro)), values=rev(c("grey80","grey50","grey10"))) +
  labs(y="Sublineage", x="Sample Count", fill="Sublineage\nType") +
  geom_text(data=sublineage.country.counts.incSing, aes((Count+20), TPA.pinecone.sublineage,label=Count), size=2.5, inherit.aes = F)
#p.sublineage.private.hbarplot.incSing


#plot_grid(p.sublineage.hbarplot, p.sublineage.23S.compmap + y.theme.strip, align='h', axis='tb', rel_widths =c(1,2))
p.sublineage.23S.compmap.distributions <- plot_grid(p.sublineage.private.hbarplot.incSing, p.sublineage.23S.compmap + y.theme.strip, align='h', axis='tb', rel_widths =c(1,1))


p.sublineage.23S.compmap.distributions

Plot distros with tree

plot.MLtree.23s.with.sublineage.distros.combo <- plot_grid(p.MLtree.23S.distros, p.sublineage.23S.compmap.distributions, ncol=1, rel_heights=c(3,2), labels=c('A','B'), label_size=11)

#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure14_Sublineage_vs_macrolide-distros__02-2021.svg"), width = 800, height = 1000,type="svg",units = "pt")
plot.MLtree.23s.with.sublineage.distros.combo

#dev.off()

Look at temporal distribution of samples



TPA.meta1.2.pinecone.23s.simpledates <- plyr::join(TPA.meta1.2.pinecone[,c("Sample_Name","TPA.pinecone.sublineage","Sample_Year")],TPA.global.compmapping.23s, by="Sample_Name")

TPA.meta1.2.pinecone.23s.simpledates <- data.frame(TPA.meta1.2.pinecone.23s.simpledates %>% group_by(resistant,TPA.pinecone.sublineage,Sample_Year) %>%
  summarise(Count=n()), stringsAsFactors = F)


TPA.meta1.2.pinecone.23s.simpledates <- TPA.meta1.2.pinecone.23s.simpledates[TPA.meta1.2.pinecone.23s.simpledates$TPA.pinecone.sublineage!="Singleton",]

TPA.meta1.2.pinecone.23s.simpledates <- plyr::join(data.frame(Sample_Year=c(1912:2019),stringsAsFactors=F), TPA.meta1.2.pinecone.23s.simpledates)
TPA.meta1.2.pinecone.23s.simpledates <- TPA.meta1.2.pinecone.23s.simpledates[!is.na(TPA.meta1.2.pinecone.23s.simpledates$TPA.pinecone.sublineage),]
TPA.meta1.2.pinecone.23s.simpledates <- TPA.meta1.2.pinecone.23s.simpledates[!is.na(TPA.meta1.2.pinecone.23s.simpledates$resistant),]



TPA.meta1.2.pinecone.23s.simpledates$TPA.pinecone.sublineage <- factor(TPA.meta1.2.pinecone.23s.simpledates$TPA.pinecone.sublineage, levels=(sublineages.cols.brew$sublineage))


#ggplot(TPA.meta1.2.pinecone.23s.simpledates, aes(Sample_Year, TPA.pinecone.sublineage, color=resistant, size=Count)) +
#  geom_point()

#ggplot(TPA.meta1.2.pinecone.23s.simpledates, aes(Sample_Year, TPA.pinecone.sublineage, color=resistant, size=Count)) +
#  geom_jitter()


p.bubbleplot.sublineage.resistance.alleles <- ggplot(TPA.meta1.2.pinecone.23s.simpledates, aes(Sample_Year, resistant, color=resistant)) +
  geom_point(alpha=0.65, aes(size=Count)) + 
  theme_light() +
  facet_wrap(TPA.pinecone.sublineage~.) +
  coord_cartesian(xlim=c(1970,2020)) + 
  scale_x_continuous(breaks=seq(1970,2020,20)) +
  scale_size_area(max_size = 8,breaks=c(1,5,10,25,50,75,100)) +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.x=element_text(color = "grey25",angle=0, size=10)) +
  theme.text.size + theme(legend.key.size = unit(0.75,"line"),legend.position='top') +
  #scale_color_manual(values=c("black","black","grey90","grey50"), breaks=c("A2058G","A2059G","Sensitive","Uncertain"))
  scale_color_manual(name="Resistance\nAllele", values=c("red","blue","black","grey75"), breaks=c("A2058G","A2059G","Sensitive","Uncertain")) +
  geom_vline(xintercept = 2000, color='blue', alpha=0.25) +
  labs(x="Sample Year", y="Resistance Allele")
  
p.bubbleplot.sublineage.resistance.alleles

NA
NA
NA

Look at genetic distance v.s. geographic distance (need to infer geographic distances between samples)

Since specific within-country gps data is limited or unavailable, will infer geographic distance between country centroids (already inferred above for map) - crude but may provide some insights.

Plot together

TPA.alignment.data.dist.melt.meta2 <- plyr::join(TPA.alignment.data.dist.melt.meta,data.frame(Geo_Country.t1=country.coords.subset$Geo_Country, Long1=country.coords.subset$centroid.lon, Lat1=country.coords.subset$centroid.lat, stringsAsFactors = F), type="left", by="Geo_Country.t1")
TPA.alignment.data.dist.melt.meta2 <- plyr::join(TPA.alignment.data.dist.melt.meta2,data.frame(Geo_Country.t2=country.coords.subset$Geo_Country, Long2=country.coords.subset$centroid.lon, Lat2=country.coords.subset$centroid.lat, stringsAsFactors = F), type="left", by="Geo_Country.t2")

# Use geosphere package (distVincentyEllipsoid) to calculate geographic distance between points in km
TPA.alignment.data.dist.melt.meta2$Geographic.Distance <- geosphere::distVincentyEllipsoid(TPA.alignment.data.dist.melt.meta2[,c("Long1","Lat1")],TPA.alignment.data.dist.melt.meta2[,c("Long2","Lat2")])/1000

Now plot

# Genetic Distance v.s. Geographic Distance (same Lineages)
p.geographic.vs.genetic.distance.hex.Lineage <- ggplot(TPA.alignment.data.dist.melt.meta2[TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same",], aes(Geographic.Distance,Distance)) +
#ggplot(TPA.alignment.data.dist.melt.meta2, aes(Geographic.Distance,Distance)) + 
  stat_bin_hex(colour="white", na.rm=TRUE, bins = 20) +
  scale_fill_gradientn(colours=c("purple","green"), 
                       name = "Comparison Frequency", breaks=c(3,100,3000),
                       na.value=NA, trans="log10") + 
  theme_light() +
  scale_y_continuous(breaks=seq(0,250,10)) +
  labs(y="Pairwise genetic distance (SNPs)", x="Pairwise geographic distance (kilometers)") +
  theme.text.size + theme(legend.key.size = unit(0.75,"line")) +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.x=element_text(color="grey25", size=10), strip.text.y=element_text(color="grey25", size=10)) +
  theme(legend.position="bottom") +
  #ggtitle("Pairwise SNPs (same sublineage) and Years within and between British Columbia (Canada) and England (UK)") + theme(plot.title = element_text(size = 10)) +
  #facet_grid(TPA_Lineage.t1~same.continent) +
  #facet_grid(.~same.continent) +
  facet_grid(.~TPA_Lineage.t1) +
  NULL
p.geographic.vs.genetic.distance.hex.Lineage <- p.geographic.vs.genetic.distance.hex.Lineage + stat_smooth(method='lm', fullrange=F,se=T, color='black', level=95)

p.geographic.vs.genetic.distance.hex.Lineage

Calculate Pearson’s correlation (for real dataset)

# For whole dataset (but only looking within same Lineage)
real.correlation1 <- cor(TPA.alignment.data.dist.melt.meta2[TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same","Distance"],TPA.alignment.data.dist.melt.meta2[TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same","Geographic.Distance"])
real.correlation1
[1] 0.240001
nrow(TPA.alignment.data.dist.melt.meta2[TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same",])
[1] 191096
# Explicitly By Lineage

# Nichols
real.correlation1.Nichols <- cor(TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta2$TPA_Lineage.t1=="Nichols"),"Distance"],TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta2$TPA_Lineage.t1=="Nichols"),"Geographic.Distance"])
real.correlation1.Nichols
[1] 0.486289
nrow(TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta2$TPA_Lineage.t1=="Nichols"),])
[1] 9620
# SS14
real.correlation1.SS14 <- cor(TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta2$TPA_Lineage.t1=="SS14"),"Distance"],TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta2$TPA_Lineage.t1=="SS14"),"Geographic.Distance"])
real.correlation1.SS14
[1] 0.3091876
nrow(TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta2$TPA_Lineage.t1=="SS14"),])
[1] 181476

So looking at the full dataset

# Accross whole dataset
Correlation.gen.v.geo.dist.all.samples <- calculate.genetic.vs.geographic.distance.correlation(TPA.alignment.data.dist.melt.meta2)
Correlation.gen.v.geo.dist.all.samples[1,]

# Accross whole dataset, but constrained to genetic distances within lineage
Correlation.gen.v.geo.dist.all.samples.Lineage <- calculate.genetic.vs.geographic.distance.correlation(TPA.alignment.data.dist.melt.meta2[TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same",])
Correlation.gen.v.geo.dist.all.samples.Lineage[1,]

# Explicitly By Lineage
Correlation.gen.v.geo.dist.Nichols.Lineage <- calculate.genetic.vs.geographic.distance.correlation(TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta2$TPA_Lineage.t1=="Nichols"),])
Correlation.gen.v.geo.dist.Nichols.Lineage[1,]

Correlation.gen.v.geo.dist.SS14.Lineage <- calculate.genetic.vs.geographic.distance.correlation(TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta2$TPA_Lineage.t1=="SS14"),])
Correlation.gen.v.geo.dist.SS14.Lineage[1,]


# Explicitly By sublineage

Now do it within major sublineages)

# Genetic Distance v.s. Geographic Distance (same sublineages)
TPA.alignment.data.dist.melt.meta2$TPA.pinecone.sublineage.t1 <- factor(TPA.alignment.data.dist.melt.meta2$TPA.pinecone.sublineage.t1, levels=sublineages.cols.brew$sublineage)


sublineage_names <- c(`1`="Sublineage 1",`2`="Sublineage 2",`8`="Sublineage 8",`14`="Sublineage 14")


p.geographic.vs.genetic.distance.hex.sublin <- ggplot(TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.Pinecone.cluster=="same" & TPA.alignment.data.dist.melt.meta2$TPA.pinecone.sublineage.t1 %in% c(1,2,8,14)),], aes(Geographic.Distance,Distance)) +
#ggplot(TPA.alignment.data.dist.melt.meta2, aes(Geographic.Distance,Distance)) + 
  stat_bin_hex(colour="white", na.rm=TRUE, bins = 20) +
  #geom_density_2d_filled() + scale_fill_brewer(palette="PuRd") +
  scale_fill_gradientn(colours=c("purple","green"), 
                       name = "Comparison Frequency", breaks=c(3,100,3000),
                       na.value=NA, trans="log10") + 
  theme_light() +
  scale_y_continuous(breaks=seq(0,250,10)) +
  labs(y="Pairwise genetic distance (SNPs)", x="Pairwise geographic distance (kilometers)") +
  theme.text.size + theme(legend.key.size = unit(0.75,"line")) +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.x=element_text(color="grey25", size=10), strip.text.y=element_text(color="grey25", size=10)) +
  theme(legend.position="bottom") +
  #ggtitle("Pairwise SNPs (same sublineage) and Years within and between British Columbia (Canada) and England (UK)") + theme(plot.title = element_text(size = 10)) +
  facet_wrap(vars(TPA.pinecone.sublineage.t1), labeller=as_labeller(sublineage_names)) +
  stat_smooth(method='lm', fullrange=F,se=T, color='black', level=95) +
  NULL


p.geographic.vs.genetic.distance.hex.sublin 

NA
NA
NA

Combine lineage and sublineage distance plots

plot.geographic.distance.within.Lin.Sublin <- plot_grid(p.geographic.vs.genetic.distance.hex.Lineage, p.geographic.vs.genetic.distance.hex.sublin + theme(legend.position='none'), ncol=1, rel_heights=c(3,4), labels=c('A','B'), label_size=11) 


#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure12_Genetic-vs-Geographic_distance__02-2021.svg"), width = 600, height = 600,type="svg",units = "pt")
plot.geographic.distance.within.Lin.Sublin

#dev.off()

Create Supplementary Metadata output file

#colnames(TPA.meta1.2)
TPA.meta1.2.final.Supplementary <- TPA.meta1.2[,c("Sample_Name", "Sanger_Lane_ID_raw", "Cleaned_fastq_id", "Cleaned_fastq_readcount","SRR/ENA_Accession","Reads_or_assemblies","Species","Sample_Year","Citation","Sample_Type","Clinical","Direct_from_clin","Duplicate","Geo_Region","Geo_Country","Continent","TPA_Lineage","Proportion-N_>5_mapping+masking_Nichols","Mapping_Good<25%N","Mapping_Terrible>75%N","Mean_mapping_coverage")]

# Apend info about which samples were used in the finescale clustering analysis
#colnames(TPA.meta1.2.pinecone)
TPA.meta1.2.final.Supplementary <- plyr::join(TPA.meta1.2.final.Supplementary,TPA.meta1.2.pinecone[,c("Sample_Name","TPA.pinecone.sublineage")], by="Sample_Name", type="full")
TPA.meta1.2.final.Supplementary$finescale.analysis <- ifelse(is.na(TPA.meta1.2.final.Supplementary$TPA.pinecone.sublineage),"No","Yes")

# Append info about which samples were used in the temporal BEAST analysis
in.beast.tree <- data.frame(Sample_Name=full.beast2.tipnames$meta.name,stringsAsFactors=F)
in.beast.tree$full.temporal.analysis <- "Yes"
TPA.meta1.2.final.Supplementary <- plyr::join(TPA.meta1.2.final.Supplementary, in.beast.tree, by="Sample_Name", type="full")
TPA.meta1.2.final.Supplementary$full.temporal.analysis <- ifelse(is.na(TPA.meta1.2.final.Supplementary$full.temporal.analysis),"No","Yes")

# Append antimicrobial resistance information
TPA.meta1.2.final.Supplementary <- plyr::join(TPA.meta1.2.final.Supplementary, data.frame(Sample_Name=TPA.global.compmapping.23s$Sample, A2058G=TPA.global.compmapping.23s$VariantPresent_A2058G_redo, A2059G=TPA.global.compmapping.23s$VariantPresent_A2059G_redo, stringsAsFactors=F), by="Sample_Name", type="full")

# Relabel citation for novel sequences

TPA.meta1.2.final.Supplementary[TPA.meta1.2.final.Supplementary$Citation=="unpublished-WSI","Citation"] <- "This_Study"
TPA.meta1.2.final.Supplementary[TPA.meta1.2.final.Supplementary$Citation=="unpublished-Taiaroa","Citation"] <- "This_Study"


# Reorder dataframe by Citation
TPA.meta1.2.final.Supplementary <- TPA.meta1.2.final.Supplementary[order(TPA.meta1.2.final.Supplementary$Citation,TPA.meta1.2.final.Supplementary$Sample_Name),]


#write.csv(TPA.meta1.2.final.Supplementary, file=paste0(Figure_output_directory, "Supplementary_Data1_Sample-Metadata__03-2021.csv"), row.names = F)

ENA assembly submissions (high quality assemblies - assemblies not actually used for paper, but will publish separately for the community)


#ENA.assemblies <- TPA.meta1.2[(TPA.meta1.2$`CheckM>95%_Completeness`=='yes' & TPA.meta1.2$`CheckM<5%-contamination`=='yes' & TPA.meta1.2$`contigs_<600`=='yes' & TPA.meta1.2$`Mapping_Good<25%N`=='Yes' & TPA.meta1.2$Citation=="unpublished-WSI"),c("Sample_Name", "Sanger_Lane_ID_raw", "Cleaned_fastq_id", "Cleaned_fastq_readcount","SRR/ENA_Accession","Reads_or_assemblies","Species","Sample_Year","Citation","Sample_Type","Clinical","Direct_from_clin","Duplicate","Geo_Region","Geo_Country","Continent","TPA_Lineage","Proportion-N_>5_mapping+masking_Nichols","Mapping_Good<25%N","Mapping_Terrible>75%N","Mean_mapping_coverage","SPAdes-pilon_assembly_id","Assembly_#_contigs","Assembly_N50","CheckM-completeness","CheckM-contamination")]


#ENA.assemblies <- ENA.assemblies[ENA.assemblies$`Assembly_#_contigs`<250,]
#ENA.assemblies <- ENA.assemblies[ENA.assemblies$`CheckM-contamination`<2.5,]
#ENA.assemblies <- ENA.assemblies[!is.na(ENA.assemblies$Sample_Name),]


c("NL12", "PHE120006A", "PHE120007A", "PHE120009B", "PHE120011A", "PHE120021A", "PHE120024A", "PHE130036A", "PHE130039A", "PHE130040A", "PHE130043A", "PHE130045A", "PHE130047A", "PHE130052A", "PHE130053A", "PHE130064A", "PHE140073A", "PHE140074A", "PHE140084A", "PHE140085A", "PHE140093A", "PHE140095A", "PHE150110A", "PHE150119A", "PHE150121A", "PHE150122A", "PHE150126A", "PHE150130A", "PHE150131A", "PHE150133A", "PHE150137A", "PHE150138A", "PHE150143A", "PHE150145A", "PHE150148A", "PHE150149A", "PHE150153A", "PHE150160A", "PHE150161A", "PHE150162A", "PHE150166A", "PHE150177A", "PHE160190A", "PHE160196A", "PHE160197A", "PHE160198A", "PHE160203A", "PHE160211A", "PHE160214A", "PHE160217A", "PHE160224A", "PHE160239A", "PHE160240A", "PHE160243A", "PHE160249A", "PHE160253A", "PHE160256A", "PHE160259A", "PHE160262A", "PHE160264A", "PHE160277A", "PHE160280A", "PHE160287A", "PHE160290A", "PHE160298A", "PHE160299A", "PHE160309A", "PHE160312A", "PHE160315A", "PHE160316A", "PHE170328A", "PHE170329A", "PHE170333A", "PHE170336B", "PHE170346A", "PHE170349A", "PHE170351A", "PHE170352A", "PHE170356A", "PHE170366A", "PHE170370A", "PHE170372A", "PHE170374A", "PHE170380A", "PHE170381A", "PHE170387A", "PHE170388A", "PHE170398A", "PHE170402A", "PHE170403A", "PHE170405A", "PHE170407A", "PHE170408A", "TPA_ALC015", "TPA_ALC034", "TPA_ALC077", "TPA_BCC004", "TPA_BCC005", "TPA_BCC008", "TPA_BCC009", "TPA_BCC012", "TPA_BCC014", "TPA_BCC023", "TPA_BCC030", "TPA_BCC032", "TPA_BCC034", "TPA_BCC040", "TPA_BCC049", "TPA_BCC052", "TPA_BCC055", "TPA_BCC058", "TPA_BCC061", "TPA_BCC063", "TPA_BCC064", "TPA_BCC075", "TPA_BCC079", "TPA_BCC085", "TPA_BCC088", "TPA_BCC101", "TPA_BCC102", "TPA_BCC106", "TPA_BCC108", "TPA_BCC109", "TPA_BCC111", "TPA_BCC122", "TPA_BCC127", "TPA_BCC128", "TPA_BCC129", "TPA_BCC130", "TPA_BCC132", "TPA_BCC134", "TPA_BCC137", "TPA_BCC139", "TPA_BCC140", "TPA_BCC141", "TPA_reBCC165", "TPA_BCC166", "TPA_BCC174", "TPA_BCC175", "TPA_BCC181", "TPA_BCC185", "TPA_BCC186", "TPA_BCC187", "TPA_BCC196", "TPA_BCC197", "TPA_BCC198", "TPA_BCC199", "TPA_EIR008", "TPA_EIR013", "TPA_EIR015", "TPA_EIR017", "TPA_ESBCN005", "TPA_OMI002", "TPA_OMI015", "TPA_OMI021", "TPA_OMI075", "TPA_UKBIR026", "TPA_UKBIR028", "TPA_UKBIR044", "TPA_UKBIR052", "TPA_UKBRG004", "TPA_UKBRG008", "TPA_UKBRG010", "TPA_UKBRG012", "TPA_UKBRG017", "TPA_UKBRG018", "TPA_UKLEE004", "TPA_UKMAN003", "TPA_UKMAN019", "TPA_UKMAN027", "TPA_UKMAN047", "TPA_UKMAN054", "TPA_USL-BAL-2", "TPA_USL-BAL-6", "TPA_USL-BAL-7", "TPA_USL-BAL-8", "TPA_USL-Grady-1", "TPA_USL-Haiti-B", "TPA_USL-Phil-1", "TPA_USL-Phil-3", "TPA_USL-SEA-81-3", "TPA_USL-SEA-81-8", "TPA_USL-SEA-83-1", "TPA_USL-SEA-83-2", "TPA_USL-SEA-84-2", "TPA_USL-SEA-86-1", "TPA_USL-SEA-87-1", "TPA_ZIM005", "TPA_ZIM007", "TPA_ZIM018", "TPA_ZIM024", "TPA_ZIM025", "TPA_ZIM028", "UW202B", "TPA_ALC105", "TPA_BCC103", "TPA_OMI006", "TPA_OMI022", "TPA_OMI029", "TPA_OMI033", "TPA_ZIM019", "TPA_HUN190022", "TPA_HUN200024", "TPA_HUN190020", "TPA_RUS_Tuva-39", "TPA_RUS_Tuva-58", "TPA_RUS_Tuva-59", "TPA_RUS_Tuva-26", "TPA_RUS_Tuva-41", "TPA_SWE-996", "TPA_AUSBR-41")
  [1] "NL12"             "PHE120006A"       "PHE120007A"       "PHE120009B"       "PHE120011A"       "PHE120021A"       "PHE120024A"      
  [8] "PHE130036A"       "PHE130039A"       "PHE130040A"       "PHE130043A"       "PHE130045A"       "PHE130047A"       "PHE130052A"      
 [15] "PHE130053A"       "PHE130064A"       "PHE140073A"       "PHE140074A"       "PHE140084A"       "PHE140085A"       "PHE140093A"      
 [22] "PHE140095A"       "PHE150110A"       "PHE150119A"       "PHE150121A"       "PHE150122A"       "PHE150126A"       "PHE150130A"      
 [29] "PHE150131A"       "PHE150133A"       "PHE150137A"       "PHE150138A"       "PHE150143A"       "PHE150145A"       "PHE150148A"      
 [36] "PHE150149A"       "PHE150153A"       "PHE150160A"       "PHE150161A"       "PHE150162A"       "PHE150166A"       "PHE150177A"      
 [43] "PHE160190A"       "PHE160196A"       "PHE160197A"       "PHE160198A"       "PHE160203A"       "PHE160211A"       "PHE160214A"      
 [50] "PHE160217A"       "PHE160224A"       "PHE160239A"       "PHE160240A"       "PHE160243A"       "PHE160249A"       "PHE160253A"      
 [57] "PHE160256A"       "PHE160259A"       "PHE160262A"       "PHE160264A"       "PHE160277A"       "PHE160280A"       "PHE160287A"      
 [64] "PHE160290A"       "PHE160298A"       "PHE160299A"       "PHE160309A"       "PHE160312A"       "PHE160315A"       "PHE160316A"      
 [71] "PHE170328A"       "PHE170329A"       "PHE170333A"       "PHE170336B"       "PHE170346A"       "PHE170349A"       "PHE170351A"      
 [78] "PHE170352A"       "PHE170356A"       "PHE170366A"       "PHE170370A"       "PHE170372A"       "PHE170374A"       "PHE170380A"      
 [85] "PHE170381A"       "PHE170387A"       "PHE170388A"       "PHE170398A"       "PHE170402A"       "PHE170403A"       "PHE170405A"      
 [92] "PHE170407A"       "PHE170408A"       "TPA_ALC015"       "TPA_ALC034"       "TPA_ALC077"       "TPA_BCC004"       "TPA_BCC005"      
 [99] "TPA_BCC008"       "TPA_BCC009"       "TPA_BCC012"       "TPA_BCC014"       "TPA_BCC023"       "TPA_BCC030"       "TPA_BCC032"      
[106] "TPA_BCC034"       "TPA_BCC040"       "TPA_BCC049"       "TPA_BCC052"       "TPA_BCC055"       "TPA_BCC058"       "TPA_BCC061"      
[113] "TPA_BCC063"       "TPA_BCC064"       "TPA_BCC075"       "TPA_BCC079"       "TPA_BCC085"       "TPA_BCC088"       "TPA_BCC101"      
[120] "TPA_BCC102"       "TPA_BCC106"       "TPA_BCC108"       "TPA_BCC109"       "TPA_BCC111"       "TPA_BCC122"       "TPA_BCC127"      
[127] "TPA_BCC128"       "TPA_BCC129"       "TPA_BCC130"       "TPA_BCC132"       "TPA_BCC134"       "TPA_BCC137"       "TPA_BCC139"      
[134] "TPA_BCC140"       "TPA_BCC141"       "TPA_reBCC165"     "TPA_BCC166"       "TPA_BCC174"       "TPA_BCC175"       "TPA_BCC181"      
[141] "TPA_BCC185"       "TPA_BCC186"       "TPA_BCC187"       "TPA_BCC196"       "TPA_BCC197"       "TPA_BCC198"       "TPA_BCC199"      
[148] "TPA_EIR008"       "TPA_EIR013"       "TPA_EIR015"       "TPA_EIR017"       "TPA_ESBCN005"     "TPA_OMI002"       "TPA_OMI015"      
[155] "TPA_OMI021"       "TPA_OMI075"       "TPA_UKBIR026"     "TPA_UKBIR028"     "TPA_UKBIR044"     "TPA_UKBIR052"     "TPA_UKBRG004"    
[162] "TPA_UKBRG008"     "TPA_UKBRG010"     "TPA_UKBRG012"     "TPA_UKBRG017"     "TPA_UKBRG018"     "TPA_UKLEE004"     "TPA_UKMAN003"    
[169] "TPA_UKMAN019"     "TPA_UKMAN027"     "TPA_UKMAN047"     "TPA_UKMAN054"     "TPA_USL-BAL-2"    "TPA_USL-BAL-6"    "TPA_USL-BAL-7"   
[176] "TPA_USL-BAL-8"    "TPA_USL-Grady-1"  "TPA_USL-Haiti-B"  "TPA_USL-Phil-1"   "TPA_USL-Phil-3"   "TPA_USL-SEA-81-3" "TPA_USL-SEA-81-8"
[183] "TPA_USL-SEA-83-1" "TPA_USL-SEA-83-2" "TPA_USL-SEA-84-2" "TPA_USL-SEA-86-1" "TPA_USL-SEA-87-1" "TPA_ZIM005"       "TPA_ZIM007"      
[190] "TPA_ZIM018"       "TPA_ZIM024"       "TPA_ZIM025"       "TPA_ZIM028"       "UW202B"           "TPA_ALC105"       "TPA_BCC103"      
[197] "TPA_OMI006"       "TPA_OMI022"       "TPA_OMI029"       "TPA_OMI033"       "TPA_ZIM019"       "TPA_HUN190022"    "TPA_HUN200024"   
[204] "TPA_HUN190020"    "TPA_RUS_Tuva-39"  "TPA_RUS_Tuva-58"  "TPA_RUS_Tuva-59"  "TPA_RUS_Tuva-26"  "TPA_RUS_Tuva-41"  "TPA_SWE-996"     
[211] "TPA_AUSBR-41"    
---
title: "Global TPA Uber Analysis 2020/2021, April 2021"
output: html_notebook
---

Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Cmd+Option+I*.
Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 


# Read in dependencies 
```{r}
library(ggplot2)
library(treeio)
library(ggtree)
library(ggnewscale)
library(plyr)
library(dplyr)
library(tidyverse)
library(phytools)
library(randomcoloR)
library(RColorBrewer)
library(lubridate)
library(readxl)
library(ggforce)
library(ggstance)
library(ggridges)
library(Cairo)
library(cowplot)
library(ggmap)
library(CoordinateCleaner)
library(gridExtra)
library(hexbin)
library(emojifont)
library(scales)

library(pairsnp)
library(rPinecone)

R.Version()
print(sessionInfo())
```


# Read in data
```{r}

# ML Tree (using raw sequences with minimal filters)
TPA.rawseq.ML.tree.file <- "TPA-uber.remasked.2020-11-10.lowcov75.SNPs.aln.renamed.treefile"

# ML tree (refined dataset)
TPA.MLtree.file <- "TPA-uber.remasked.2020-11-10.goodcov25.gubbins.SNPs.aln.renamed.treefile"

# Pyjar tree (refined dataset)
TPA.pyjar.file <- "TPA-uber.remasked.2020-11-10.goodcov25.gubbins.SNPs.aln.renamed.pyjar.tre"

# Multiple Sequence alignment of SNPs for ML tree/pyjar
TPA.MSA.SNPs.aln.file <- "TPA-uber.remasked.2020-11-10.goodcov25.gubbins.SNPs.renamed.aln"

# Pinecone clustering bootstrap data
pinecone.10.file <- "TPA-uber.bootstrapped-pinecone.10-3.2021-01-27.v2.pinecone.bootstrap.table.csv"

# Master Metadata spreadsheet
#TPA.meta1.file <- "/Users/mb29/Treponema/Expanded_Global_Sequencing/Global_Sequence_Collection_Info_update-03-2021.xlsx"
TPA.meta1.file <- "Supplementary_Data1_Sample-Metadata__03-2021.xlsx"


# Some population prevalence data
UK.stats.file <- "PHE_2019_UK_Syphilis-rate-per-100k-pop__02-11-2020.tsv"
BC.stats.file <- "BCCDC-Canada__BC-Syphilis-rate-per-100k-pop_2017___02-11-2020.tsv"

# BEAST Analyses
# Subsampled BEAST analysis 1
TPA.beast.subtree.file <- "TPA-uber.remasked.2020-11-10.gubbins.subsampled.2020-11-18.WGS.rendates.noInv_StrictCSkyline_combined.2020-11-23.consensus.tree"

beast.subtree.skyline.file <- "TPA-uber.remasked.2020-11-10.gubbins.subsampled.2020-11-18.WGS.rendates.noInv_StrictCSkyline_1.export_skyline_data.txt"

beast.subtree.skyline.lineage.file <- "TPA-uber_beast2_strict-skyline-500M_10pop_combined.lineages-data.tsv"


# Subsampled BEAST analysis 2 (repeat)
repeat.subsampled.skyline.tree.file <- "TPA-uber.remasked.2020-11-10.gubbins.subsampled.2.2020-11-23.WGS.rendates.noInv.Strict-Skyline-HKY_combined.2020-11-26.consensus.tree"

repeat.subsampled.skyline.data.file <- "TPA-uber.remasked.2020-11-10.gubbins.subsampled.2.2020-11-23.WGS.rendates.noInv.Strict-Skyline-HKY_1.skyline-data.tsv"
repeat.subsampled.skyline.lineages.data.file <- "TPA-uber.remasked.2020-11-10.gubbins.subsampled.2.2020-11-23.WGS.rendates.noInv.Strict-Skyline-HKY_1.skyline-lineages-data.tsv"

# Full size BEAST2 analysis
full.beast2.tree.file <- "TPA-uber_beast2_strict-skyline-500M_10pop_consensus.tree"

beast2.runs.filepath <- "./"

beast2.full.skyline.path <- "TPA-uber_beast2_strict-skyline-500M_10pop_combined.skyline-data.tsv"
beast2.full.lineages.path <- "TPA-uber_beast2_strict-skyline-500M_10pop_combined.lineages-data.tsv"
beast2.full.popdistro.path <- "TPA-uber_beast2_strict-skyline-500M_10pop_combined.pop-distributions_p100.txt"


beast2.pop.decline.file <- "TPA-uber_beast2_strict-skyline-500M_10pop_pop-decline.1990-2015.p50_population_change_distribution.csv"


beast2.pop.increase.file  <- "TPA-uber_beast2_strict-skyline-500M_10pop_pop-increase.1990-2015.p100_population_change_distribution.csv"

pop.decline.supporting.trees.file <- "TPA-uber_beast2_strict-skyline-500M_10pop_pop-decline.1990-2015.p50_trees_supporting.nex"

pop.decline.notsupporting.trees.file <- "TPA-uber_beast2_strict-skyline-500M_10pop_pop-decline.1990-2015.p50_trees_not_supporting.nex"


# Sublineage BEAST analysis
sublineage.skylines.filepath <- "./"
pop.distro.path <- "./" 
pop.distro.sublin.1.file <- "TPA-uber.remasked.2020-11-10.goodcov25.gubbins.WGS.sublineage.new.1.noinv.Strict-Skyline_combined.pop-expansion"
pop.distro.sublin.2.file <- "TPA-uber.remasked.2020-11-10.goodcov25.gubbins.WGS.sublineage.new.2.noinv.Strict-Skyline_combined.pop-expansion"
pop.distro.sublin.8.file <- "TPA-uber.remasked.2020-11-10.goodcov25.gubbins.WGS.sublineage.new.8.noinv.Strict-Skyline_combined.pop-expansion"

# Recombination analysis
recombination_event.file <- "Supplementary_Table_Recombination-Events_2021-03-25.xlsx"

# Tip Date Randomisation
random.tip.summary.file <- "clockRate.stats.csv"

# Macrolide resistance allele calls
TPA.global.compmapping.23s.file <-"competitive-mapping_combined-reports.all.2020-12-01.final.tsv"

# Pinecone assignments
pinecone.clusters.MLoriginal.file <- "Global-TPA.goodcov.rPinecone10-3.assignments_2020-11-11.csv"

WGS.site.positions.file <- "TPA-uber.remasked.2020-11-10.goodcov25.gubbins.SNPs.site-positions.txt"

# Location to save files to
Figure_output_directory <- "/Users/mb29/Papers/Global_Treponema_Uber-Paper_2020/Figures/Figure_Drafting/"
```


# Read in trees
```{r}
TPA.rawseq.ML.tree <- midpoint.root(read.tree(TPA.rawseq.ML.tree.file))
TPA.MLtree <- midpoint.root(read.tree(TPA.MLtree.file))
TPA.pyjar.tree <- midpoint.root(read.tree(TPA.pyjar.file))



```


# make some shortcuts for plotting 
```{r}
y.theme.strip <- theme(axis.title.y = element_blank(), axis.text.y = element_blank(), axis.ticks.y= element_blank())
y.theme.strip.partial <- theme(axis.text.y = element_blank(), axis.ticks.y= element_blank())

x.theme.strip <- theme(axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x= element_blank())
x.theme.strip.partial <- theme(axis.text.x = element_blank(), axis.ticks.x= element_blank())
x.theme.strip.labs <- theme(axis.text.x = element_blank(),axis.title.x = element_blank())

x.theme.axis.rotate <- theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

theme.text.size <- theme(text = element_text(size = 10))

'%notin%' <- Negate('%in%')

```


# Filter metadata
```{r}
# First do some cleaning
#TPA.meta1 <- readxl::read_excel(TPA.meta1.file,sheet="All_useable_T.pal_01-2020_refil")
TPA.meta1 <- readxl::read_excel(TPA.meta1.file,sheet="Supplementary_Data1_Sample-Meta")
TPA.meta1 <- subset(TPA.meta1, select=-c(A2058G,A2059G,TPA.pinecone.sublineage))


TPA.meta1.2 <- TPA.meta1[TPA.meta1$Sample_Name %in% TPA.rawseq.ML.tree$tip.label,]

TPA.meta1.2 <- TPA.meta1.2[TPA.meta1.2$Species=="TPA",]
#TPA.meta1.2 <- TPA.meta1.2[TPA.meta1.2$Study_Type=="TPA-Global",]
TPA.meta1.2 <- TPA.meta1.2[TPA.meta1.2$Duplicate!="Yes",]
#TPA.meta1.2 <- TPA.meta1.2[TPA.meta1.2$Sample_Year!="-",]


#TPA.meta1.2 <- TPA.meta1.2[TPA.meta1.2$`SKA_Alignment_terrible>50%?`=="No",]
TPA.meta1.2 <- TPA.meta1.2[!is.na(TPA.meta1.2$Sample_Name),]
#TPA.meta1.2 <- data.frame(TPA.meta1.2,stringsAsFactors = F)


TPA.meta1.2$Geo_Country <- gsub("\\_","\\ ",TPA.meta1.2$Geo_Country)
```


do some date parsing to create date groups
```{r}
floor_5years  <- function(value){ return(value - value %% 5) }
ceiling_5years <- function(value){ return(round_to_5years(value)+5) }
round_to_5years <- function(value){ return(round(value / 5) * 5) }

TPA.meta1.2$Sample_5year.floor <- floor_5years(as.numeric(TPA.meta1.2$Sample_Year))
TPA.meta1.2$Sample_5year.window <- paste0(floor_5years(as.numeric(TPA.meta1.2$Sample_Year)),"-",floor_5years(as.numeric(TPA.meta1.2$Sample_Year))+5)


# Some samples have uncertain dates (up to 20-30 years uncertainty), but for the purposes of these plotting categories we'll use the centrepoint year
TPA.meta1.2$Sample_5year.window <- sapply(1:nrow(TPA.meta1.2), function(x) ifelse(TPA.meta1.2$Sample_Year[x]=="-",NA, ifelse(is.na(TPA.meta1.2$Sample_5year.window[x]),NA, ifelse(TPA.meta1.2$Sample_Year[x]=="1950-1980","1965-1970",ifelse(TPA.meta1.2$Sample_Year[x]=="1960-1980","1965-1970" ,ifelse(TPA.meta1.2$Sample_Year[x]=="1980-1999","1985-1990",TPA.meta1.2$Sample_5year.window[x]))))))

#TPA.meta1.2[TPA.meta1.2$Sample_5year.window=="NA-NA","Sample_5year.window"] <- NA

#View(TPA.meta1.2[,c("Sample_Name","Sample_Year","Sample_5year.window")])


# Make colour scheme for date window
TPA.5year.window.brewcols <- data.frame(window.5year=unique(TPA.meta1.2$Sample_5year.window), stringsAsFactors=F)
TPA.5year.window.brewcols$window.5year <- TPA.5year.window.brewcols[order(TPA.5year.window.brewcols$window.5year),]
TPA.5year.window.brewcols$window.5year <- factor(TPA.5year.window.brewcols$window.5year, levels=TPA.5year.window.brewcols$window.5year)
# set colour scale
TPA.5year.window.brewcols$window.5year.cols <- c("Black",brewer.pal(n=11,"RdYlBu"),"white")

#c("1910-1915","1950-1955","1965-1970,"1970-1975","1975-1980","1980-1985","1985-1990","1990-1995","2000-2005","2005-2010","2010-2015","2015-2020","NA")


# Also create a numeric date year for some calculations
TPA.meta1.2$Sample_Year.num <- as.numeric((ifelse(TPA.meta1.2$Sample_Year=="1950-1980","1965",ifelse(TPA.meta1.2$Sample_Year=="1960-1980","1970",TPA.meta1.2$Sample_Year))))

```

Create a colour scheme for countries and continents
```{r}
# Colouring for country
continental.country.cols.brew2 <- unique(TPA.meta1.2[,c("Geo_Country","Continent")])
continental.country.cols.brew2 <- continental.country.cols.brew2[order(continental.country.cols.brew2$Continent,continental.country.cols.brew2$Geo_Country),]

continental.country.cols.brew2$country.col <- c("#ec7014","#fec44f","#de2d26","#fb6a4a","#bdbdbd","#737373",brewer.pal(n=8,"Purples")[3:8],brewer.pal(n=8,"Blues")[3:8],brewer.pal(n=5,"Greens")[3:5],"#c51b8a","#8c510a")

# Colouring for Continent
continental.cols.brew2 <- data.frame(Continent=sort(unique(TPA.meta1.2$Continent)),stringsAsFactors=F)
continental.cols.brew2$continent.col <- c("#fec44f","#de2d26","#bdbdbd","#2171b5","#74c476","#c51b8a","#ec7014")

# Colouring for TPA Lineage
TPA_Lineage.cols <- data.frame(Lineage=sort(unique(TPA.meta1.2$TPA_Lineage)),stringsAsFactors=F)
#TPA_Lineage.cols$Lineage.col <- c("royalblue2", "grey40", "indianred1")
TPA_Lineage.cols$Lineage.col <- c("royalblue2", "indianred1")
#c("#436eee", "#666666","#ff6a6a")
TPA_Lineage.cols$Lineage <- factor(TPA_Lineage.cols$Lineage, levels=c("Nichols","SS14","outlier"))
#TPA_Lineage.cols$Lineage <- factor(TPA_Lineage.cols$Lineage, levels=c("Nichols","SS14"))


# Lineage Hexcodes
# royalblue2 #436eee
# indianred1 #ff6a6a


#TPA_Lineage.cols[order(TPA_Lineage.cols$Lineage,c("Nichols","SS14","outlier")),]
```

# check labelling issues
```{r}
TPA.rawseq.ML.tree$tip.label[TPA.rawseq.ML.tree$tip.label %notin% TPA.meta1.2$Sample_Name]

TPA.meta1.2$Sample_Name[TPA.meta1.2$Sample_Name %notin% TPA.rawseq.ML.tree$tip.label]
```

```{r}
# There is one very low coverage sample (TPA_BCC144, 47% genome breadth, 7.9X mean coverage) with odd phylogenetic placement - it's SS14, but basal in this analysis. Since the coverage is so low, it's not possible to further investigate this, so classify it here as SS14.
TPA.meta1.2[(TPA.meta1.2$TPA_Lineage=="outlier"),"TPA_Lineage"] <- "SS14"
```




```{r}
# Prepare tree
TPA.rawseq.ML.ggtree <- ggtree(TPA.rawseq.ML.tree,layout = "fan",open.angle = 20, right=T)
TPA.rawseq.ML.ggtree <- ggtree(TPA.rawseq.ML.tree,layout = "fan",open.angle = 15, right=T)

# Prepare country dataset
TPA.rawseq.countries.p <- data.frame(row.names=TPA.meta1.2$Sample_Name, Country=TPA.meta1.2$Geo_Country, stringsAsFactors = F)

# Prepare continent dataset
TPA.rawseq.continents.p <- data.frame(row.names=TPA.meta1.2$Sample_Name, Continent=TPA.meta1.2$Continent, stringsAsFactors = F)

# Prepare Major lineage dataset
TPA.rawseq.Lineage.p <- data.frame(row.names=TPA.meta1.2$Sample_Name, Lineage=TPA.meta1.2$TPA_Lineage, stringsAsFactors = F)
```

```{r}
TPA.rawseq.ML.ggtree.tippoints <- TPA.rawseq.ML.ggtree %<+% data.frame(Sample_Name=rownames(TPA.rawseq.continents.p), Continent=TPA.rawseq.continents.p$Continent, stringsAsFactors = F) + 
  geom_tippoint(aes(color=Continent), size=0.5, alpha=0.5) + 
  scale_color_manual(name="Continent",values=continental.cols.brew2$continent.col, breaks=continental.cols.brew2$Continent) 

# Rescale colours
TPA.rawseq.ML.ggtree.tippoints <- TPA.rawseq.ML.ggtree.tippoints + new_scale_color()

# Plot continent strip
p.TPA.rawseq.ML.ggtree.tippoint.country.cont <- gheatmap(TPA.rawseq.ML.ggtree.tippoints,TPA.rawseq.continents.p, color=NULL,width=0.075,colnames_angle=-45,colnames_offset_y=0.02, hjust=0.0,font.size=2.25) + 
  scale_fill_manual(name="Continent",values=continental.cols.brew2$continent.col, breaks=continental.cols.brew2$Continent) + 
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right') +
  NULL
p.TPA.rawseq.ML.ggtree.tippoint.country.cont <- p.TPA.rawseq.ML.ggtree.tippoint.country.cont + new_scale_fill()



p.TPA.rawseq.ML.ggtree.tippoint.country.cont <- gheatmap(p.TPA.rawseq.ML.ggtree.tippoint.country.cont,TPA.rawseq.countries.p, color=NULL,width=0.075,offset=0.00001225, colnames_angle=-45,colnames_offset_y=0.02, hjust=0.0,font.size=2.25) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right') +
  NULL
p.TPA.rawseq.ML.ggtree.tippoint.country.cont <- p.TPA.rawseq.ML.ggtree.tippoint.country.cont + new_scale_fill()


# Add sublineage
p.TPA.rawseq.ML.ggtree.tippoint.country.cont <- gheatmap(p.TPA.rawseq.ML.ggtree.tippoint.country.cont,TPA.rawseq.Lineage.p, color=NULL,width=0.075,offset=0.00002425, colnames_angle=-45,colnames_offset_y=0.02, hjust=-0.0,font.size=2.25) + 
  scale_fill_manual(name="Lineage",values=TPA_Lineage.cols$Lineage.col, breaks=TPA_Lineage.cols$Lineage) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right') +
  NULL
p.TPA.rawseq.ML.ggtree.tippoint.country.cont <- p.TPA.rawseq.ML.ggtree.tippoint.country.cont + new_scale_fill()

# Plot tree
#p.TPA.rawseq.ML.ggtree.tippoint.country.cont
```
```{r}

#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure1_Global-TPA.low-cov-MLtree_02-20.svg"), width = 1000, height = 1000,type="svg",units = "pt")
p.TPA.rawseq.ML.ggtree.tippoint.country.cont
#dev.off()
```



```{r, fig.width=5, fig.height=3, message=FALSE, warning=FALSE}
TPA.rawseq.country.counts <- TPA.meta1.2[TPA.meta1.2$Sample_Year!="-",] %>% dplyr::group_by(Sample_Year, Geo_Country) %>% 
  dplyr::summarise(Count=n())

TPA.rawseq.country.counts$Sample_Year <- ifelse(TPA.rawseq.country.counts$Sample_Year=="1960-1980","1970",TPA.rawseq.country.counts$Sample_Year)

TPA.rawseq.country.counts <- plyr::join(data.frame(Sample_Year=c(1912:2019),stringsAsFactors=F), TPA.rawseq.country.counts)
TPA.rawseq.country.counts <- TPA.rawseq.country.counts[!is.na(TPA.rawseq.country.counts$Geo_Country),]
TPA.rawseq.country.counts$Geo_Country <- factor(TPA.rawseq.country.counts$Geo_Country, levels=continental.country.cols.brew2$Geo_Country)


p.Country.temporal.bubbleplot <- ggplot(TPA.rawseq.country.counts, aes(Sample_Year, Geo_Country, colour=Geo_Country)) +
  geom_point(alpha=0.65, aes(size=Count)) + 
  #guides(colour=FALSE) +
  theme_light() +
  scale_size_area(max_size = 8,breaks=c(1,5,10,25,50,75,100)) +
  scale_color_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none') +
  coord_cartesian(xlim=c(1950,2020)) +
  labs(y="Country", x="Sample Year")
#p.Country.temporal.bubbleplot

p.Country.temporal.bubbleplot.legend <- get_legend(p.Country.temporal.bubbleplot + theme(legend.key.size = unit(0.65,"line"),legend.position='left'))
```

```{r}
raw.country.counts <- TPA.meta1.2 %>% group_by(Geo_Country) %>% summarise(Count=n())
raw.country.counts$Geo_Country <- factor(raw.country.counts$Geo_Country,levels=continental.country.cols.brew2$Geo_Country)

p.Country.hbarplot <- ggplot(raw.country.counts, aes(Count,Geo_Country,fill=Geo_Country)) +
  geom_barh(stat="identity", position="stack", width=0.75) +
  theme_light() +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none') +
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  geom_text(data=raw.country.counts, aes((Count+30), Geo_Country,label=Count), size=2.5, inherit.aes = F) +
  labs(y="Country", x="Samples/Country") +
  coord_cartesian(xlim=c(0,625))
#p.Country.hbarplot  
```
plot together
```{r, fig.width=6, fig.height=3, message=FALSE, warning=FALSE}
grid.arrange(p.Country.temporal.bubbleplot, p.Country.hbarplot + y.theme.strip, ncol=2, widths=c(3,1))
```


# Now, lets think about just what lineages are where

```{r}
major.lineage.country.summary.simple <- TPA.meta1.2[TPA.meta1.2$TPA_Lineage %in% c("SS14","Nichols"),] %>% dplyr::group_by(TPA_Lineage,Geo_Country) %>% 
  dplyr::summarise(total.samples=n())
major.lineage.country.summary.simple$Geo_Country <- factor(major.lineage.country.summary.simple$Geo_Country, levels=rev(sort(unique(major.lineage.country.summary.simple$Geo_Country)))) 

country.summary.simple <- TPA.meta1.2[TPA.meta1.2$TPA_Lineage %in% c("SS14","Nichols"),] %>% dplyr::group_by(Geo_Country) %>% 
  dplyr::summarise(total.samples=n())
country.summary.simple$Geo_Country <- factor(country.summary.simple$Geo_Country, levels=rev(sort(unique(country.summary.simple$Geo_Country)))) 
```


```{r}
major.lineage.country.summary.simple$TPA_Lineage <- factor(major.lineage.country.summary.simple$TPA_Lineage, levels=unique(major.lineage.country.summary.simple$TPA_Lineage))

p.majorlineage.country.props <- ggplot(major.lineage.country.summary.simple, aes(Geo_Country, total.samples, fill=TPA_Lineage)) +
  geom_bar(stat="identity",position = position_fill(reverse = TRUE)) +
  theme_light() + 
  scale_fill_manual(values=c("royalblue2","indianred1")) + 
  scale_y_continuous(breaks=c(0,0.5,1)) +
  coord_flip() +
  theme.text.size +
  labs(y="Proportion", x="Country")
```


```{r}
p.majorlineage.country.counts <- ggplot(major.lineage.country.summary.simple, aes(Geo_Country, total.samples, fill=TPA_Lineage)) + 
         geom_bar(stat="identity",position="stack") +
  theme_light() + 
  scale_fill_manual(values=c("royalblue2","indianred1")) +
  #scale_y_log10() +
  coord_flip() +
  labs(fill="TPA Lineage", y="Sample Count") + 
  geom_text(data=country.summary.simple, aes(Geo_Country, (total.samples+22), label=total.samples), size=2.5, inherit.aes = F) +
  theme.text.size
#p.majorlineage.country.counts
```


```{r}
major.lineage.country.summary.simple2 <- major.lineage.country.summary.simple
major.lineage.country.summary.simple2$Geo_Country <- factor(major.lineage.country.summary.simple2$Geo_Country,levels=continental.country.cols.brew2$Geo_Country) 

p.majorlineage.country.props.reordered <- ggplot(major.lineage.country.summary.simple2, aes(Geo_Country, total.samples, fill=TPA_Lineage)) + 
  #geom_bar(stat="identity",position="fill", width=0.75) +
  geom_bar(stat="identity",position = position_fill(reverse=TRUE), width=0.75) +
  
  theme_light() + 
  scale_fill_manual(values=TPA_Lineage.cols$Lineage.col, breaks=TPA_Lineage.cols$Lineage) + 
  scale_y_continuous(breaks=c(0,0.5,1)) +
  coord_flip() +
  theme.text.size +
  labs(y="Lineage Proportion", x="Country", fill="TPA Lineage")
#p.majorlineage.country.props.reordered
```

```{r, fig.width=6, fig.height=3, message=FALSE, warning=FALSE}
#grid.arrange(p.Country.temporal.bubbleplot, p.Country.hbarplot + y.theme.strip,p.majorlineage.country.props.reordered + y.theme.strip + theme.text.size + theme(legend.key.size = unit(0.75,"line")), ncol=3, widths=c(6,2,2))
```


Major lineage bubbleplot timeline
```{r, fig.width=5, fig.height=3, message=FALSE, warning=FALSE}

TPA.majorlineage.counts <- TPA.meta1.2[(TPA.meta1.2$Sample_Year!="-" & TPA.meta1.2$TPA_Lineage!="outlier"),] %>% dplyr::group_by(Sample_Year,TPA_Lineage) %>% 
  dplyr::summarise(Count=n())

TPA.majorlineage.counts$Sample_Year <- ifelse(TPA.majorlineage.counts$Sample_Year=="1960-1980","1970",TPA.majorlineage.counts$Sample_Year)

TPA.majorlineage.counts <- plyr::join(data.frame(Sample_Year=c(1912:2019),stringsAsFactors=F), TPA.majorlineage.counts)
TPA.majorlineage.counts <- TPA.majorlineage.counts[!is.na(TPA.majorlineage.counts$TPA_Lineage),]
TPA.majorlineage.counts$TPA_Lineage <- factor(TPA.majorlineage.counts$TPA_Lineage, levels=rev(TPA_Lineage.cols$Lineage))


p.majorlineage.temporal.bubbleplot <- ggplot(TPA.majorlineage.counts, aes(Sample_Year, TPA_Lineage, colour=TPA_Lineage)) +
  geom_point(alpha=0.65, aes(size=Count)) + 
  theme_light() +
  scale_size_area(max_size = 8,breaks=c(1,5,10,25,50,75,100)) +
  #scale_color_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  scale_color_manual(name="Lineage",values=TPA_Lineage.cols$Lineage.col, breaks=TPA_Lineage.cols$Lineage) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none') +
  coord_cartesian(xlim=c(1950,2020)) +
  labs(y="Lineage", x="Sample Year")
p.majorlineage.temporal.bubbleplot

p.majorlineage.temporal.bubbleplot.legend <- get_legend(p.majorlineage.temporal.bubbleplot + theme(legend.key.size = unit(0.65,"line"),legend.position='left') + guides(size=FALSE))



# total counts
#TPA.majorlineage.counts.simple <- TPA.meta1.2[(TPA.meta1.2$Sample_Year!="-" & TPA.meta1.2$TPA_Lineage!="outlier"),] %>% dplyr::group_by(TPA_Lineage) %>% 
#  dplyr::summarise(Count=n())
TPA.majorlineage.counts.simple <- TPA.meta1.2 %>% dplyr::group_by(TPA_Lineage) %>% 
  dplyr::summarise(Count=n())



p.majorlineage.total.hbarplot <- ggplot(TPA.majorlineage.counts, aes(Count,TPA_Lineage, fill=TPA_Lineage)) +
  geom_barh(stat="identity", position="stack", width=0.65) + 
  theme_light() +
  scale_fill_manual(name="Lineage",values=TPA_Lineage.cols$Lineage.col, breaks=TPA_Lineage.cols$Lineage) +
  geom_text(data=TPA.majorlineage.counts.simple, aes((Count+30), TPA_Lineage, label=Count), size=2.5, inherit.aes = F) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none') +
  labs(y="Lineage", x="Count") + 
  coord_cartesian(xlim=c(0,625))
p.majorlineage.total.hbarplot

```

Look at sample dates and whether a sample was passaged or not
```{r}
TPA.meta1.2$Sample_Year.2000era <- ifelse(TPA.meta1.2$Sample_Year.num<2000,"pre2000","post1999")
TPA.meta1.2[TPA.meta1.2$Sample_Name=="DAL-1","Sample_Year.2000era"] <- "pre2000"


# Proportion of samples before and after 2000
TPA.meta1.2[!is.na(TPA.meta1.2$Sample_Year.2000era),] %>% dplyr::group_by(Sample_Year.2000era) %>%
  dplyr::summarise(count=n()) %>% 
  dplyr::mutate(perc=(count/sum(count)*100))



# Proportion of clinicals before 2000
data.frame(TPA.meta1.2[(!is.na(TPA.meta1.2$Sample_Year.2000era) & TPA.meta1.2$Sample_Year.2000era=="pre2000"),] %>% dplyr::group_by(Direct_from_clin) %>%
               dplyr::summarise(count=n()) %>% 
               dplyr::mutate(perc=(count/sum(count)*100)),stringsAsFactors = F)

# Proportion of clinicals after 1999
data.frame(TPA.meta1.2[(!is.na(TPA.meta1.2$Sample_Year.2000era) & TPA.meta1.2$Sample_Year.2000era=="post1999"),] %>% dplyr::group_by(Direct_from_clin) %>%
               dplyr::summarise(count=n()) %>% 
               dplyr::mutate(perc=(count/sum(count)*100)),stringsAsFactors = F)



```



# Do a map of sample distributions

```{r, fig.width=8, fig.height=6, message=FALSE, warning=FALSE}

#country.coords.subset1 <- data.frame(Geo_Country=unique(sublineage.country.summary.simple$Geo_Country))
country.coords.subset <- data.frame(Geo_Country=raw.country.counts$Geo_Country, stringsAsFactors = F)

country.coords.subset$name <- gsub("Czech Republic","Czechia",gsub("USA","United States",gsub("UK","United Kingdom",gsub("\\_","\\ ",country.coords.subset$Geo_Country))))

# Russia is very large - let's centre on Tuva instead (but keep the labelling for database consistency)
country.coords.subset$name <- gsub("Russia", "Tuva", country.coords.subset$name)

# Merge with published centroid locations and deduplicate
country.coords.subset <- plyr::join(country.coords.subset,CoordinateCleaner::countryref,by="name")

# Mexico centre's oddly - take the location of Mexico City instead
country.coords.subset[country.coords.subset$Geo_Country=="Mexico","centroid.lon"] <- country.coords.subset[country.coords.subset$Geo_Country=="Mexico","capital.lon"]
country.coords.subset[country.coords.subset$Geo_Country=="Mexico","centroid.lat"] <- country.coords.subset[country.coords.subset$Geo_Country=="Mexico","capital.lat"]

country.coords.subset <- country.coords.subset[!duplicated(country.coords.subset$name),c("Geo_Country","name","centroid.lon","centroid.lat")]

# Merge with country sample counts
country.coords.subset.counts <- plyr::join(country.coords.subset,raw.country.counts,by="Geo_Country")


# ggmap
world.gps.bounds <- c(left=-120, bottom=-45, right= 150, top= 72)
stamanmap.global1 <- ggmap::get_stamenmap(bbox=world.gps.bounds, maptype = "toner-lite", zoom=3)

# Reduce the intensity of the basemap (third of the alpha)
stamanmap.global1.attribs <- attributes(stamanmap.global1)
stamanmap.global1.transparent <- matrix(adjustcolor(stamanmap.global1, alpha.f = 0.5),nrow=nrow(stamanmap.global1))
attributes(stamanmap.global1.transparent) <- stamanmap.global1.attribs 


# Plot map with country sampling
stamanmap.global1.p <- ggmap(stamanmap.global1.transparent)
stamanmap.global1.p <- stamanmap.global1.p + 
  #geom_point(data=country.coords.subset.counts, aes(centroid.lon, centroid.lat, size=Count+0.5),alpha=0.50, show.legend = F) + 
  geom_point(data=country.coords.subset.counts, aes(centroid.lon, centroid.lat, size=Count, color=Geo_Country),alpha=0.8) +
  guides(colour=FALSE) +
  theme_light() +
  theme(legend.key.size = unit(0.65,"line"),legend.position='right', ) + guides(fill=guide_legend(ncol=3)) +
  theme.text.size +
  scale_color_manual(values=continental.country.cols.brew2$country.col,breaks=continental.country.cols.brew2$Geo_Country) +
  #scale_size_area(max_size = 12,breaks=c(1,5,10,25,50,100,200,400)) +
  #scale_size_binned(breaks=c(1,5,10,25,50,100,200,400)) +
  scale_size_binned(range = c(2, 10),breaks=c(5,20,50,100,200,400)) +
  labs(y="Latitude", x="Longitude", color="Country", size="Sample\nCount")



#Cairo::Cairo(file=paste0(Figure_output_directory, "Global-TPA.low-cov__Map-of-Country-distributions11-2020.svg"), width = 800, height = 500,type="svg",units = "pt")
stamanmap.global1.p
#dev.off()

#stamanmap.global1.p
```


Plot all together as a combined panel grid
```{r, fig.width=10, fig.height=9, message=FALSE, warning=FALSE}
p.bubble.legends.grid <- plot_grid(p.majorlineage.temporal.bubbleplot.legend, p.Country.temporal.bubbleplot.legend, ncol=1, rel_heights=c(1,3))

first_row_country.dist <- plot_grid(stamanmap.global1.p,labels=c('A'),ncol=1,label_size = 11,vjust=-0.25)

row2.3_column_1_country.dist <- plot_grid(p.Country.temporal.bubbleplot, p.majorlineage.temporal.bubbleplot, ncol=1, rel_heights=c(4,1), align=T, labels=c('B','E'),label_size = 11,vjust=-0.25)

row2.3_column_2_country.dist <- plot_grid(p.Country.hbarplot + y.theme.strip + coord_cartesian(x=c(0,620)), p.majorlineage.total.hbarplot + theme(legend.position="none") + y.theme.strip + theme.text.size + coord_cartesian(x=c(0,620)) + labs(x="Samples/Lineage"),ncol=1, rel_heights=c(4,1), align=T, labels=c('C','F'),label_size = 11,vjust=-0.25)

row2.3_column_3_country.dist <- plot_grid(p.majorlineage.country.props.reordered + y.theme.strip + theme.text.size + theme(legend.position="none"), NULL, ncol=1, rel_heights=c(4,1),labels=c('D',''),label_size = 11,vjust=-0.25)

row2.3_combine.columns_country.dist <- plot_grid(row2.3_column_1_country.dist, row2.3_column_2_country.dist, row2.3_column_3_country.dist,p.bubble.legends.grid,rel_widths=c(5,2,1,2), ncol=4)

gg_all_country.dist.complex <- plot_grid(first_row_country.dist, row2.3_combine.columns_country.dist, labels=c('', ''), ncol=1, rel_heights = c(6,5), scale=0.95)

#gg_all_country.dist.complex
```



```{r, fig.width=10, fig.height=10, message=FALSE, warning=FALSE}

#Cairo::Cairo(file=paste0(Figure_output_directory, "Figure1__Country-distros_complex_02-2021.svg"), width = 1000, height = 800,type="svg",units = "pt")
gg_all_country.dist.complex
#dev.off()
```


###
# Now move onto detailed gubbins masked phylogeny and sublineage analysis


```{r}
#TPA.MLtree
#TPA.pyjar.tree

#ggtree(TPA.MLtree)
#ggtree(TPA.pyjar.tree)

# Extract SNP distances from pyjar tree
edge.TPAgubbins <- data.frame(TPA.pyjar.tree$edge, edge_num=1:length(TPA.pyjar.tree$edge.length),stringsAsFactors = F)
colnames(edge.TPAgubbins)=c("parent", "node", "edge_num")
edge.TPAgubbins$SNPs <- TPA.pyjar.tree$edge.length

# now build tree
TPA.pyjar.treeplot <- ggtree(TPA.pyjar.tree) %<+% 
  edge.TPAgubbins + geom_text(aes(x=branch, label=SNPs, vjust=-.5),size=3,color="grey50") +
  #geom_tiplab(size=2,align=T,offset=.0001) +
  NULL
TPA.pyjar.treeplot 
```

# Do some clustering using rPinecone (takes a long time, so write out to file and reimport)

```{r}
#TPA.pyjar.tree.phylo <- as.phylo(TPA.pyjar.tree)
#pinecone.output <- rPinecone::pinecone(TPA.pyjar.tree.phylo,10,3) # standard approach used for TPA

######################## testing ########################
#pinecone.output.8.3 <- rPinecone::pinecone(TPA.pyjar.tree.phylo,8,3) # modifying to see the effect 8.3
#pinecone.output.12.3 <- rPinecone::pinecone(TPA.pyjar.tree.phylo,12,3) # modifying to see the effect 12.3
#pinecone.output <- pinecone.output.12.3
######################## testing ########################

#pinecone.clusters <- data.frame(pinecone.output$table,stringsAsFactors = T)
#pinecone.clusters$Sub.lineage.sing <- gsub("Singleton\\_.+","Singleton",pinecone.clusters$Sub.lineage,perl=T)
#write.csv(pinecone.clusters, file=paste0(Figure_output_directory, "Global-TPA.goodcov.rPinecone10-3.assignments_2020-11-11.csv"))

```

Alternatively, use a bootstrapped version of rPinecone clusters (run on command line due to the time taken to iterate over 100 trees)
Testing external pinecone (took about 3 hrs to run rPinecone over 100 bootstrap trees for the full dataset)
```{r}

pinecone.10 <- read.csv(pinecone.10.file, stringsAsFactors=F)

length(unique(pinecone.10$Sub.lineage))

length(unique(pinecone.10$pinecone_95))
length(unique(pinecone.10$pinecone_80))
length(unique(pinecone.10$pinecone_50))
length(unique(pinecone.10$pinecone_20))
length(unique(pinecone.10$pinecone_5))

plot__pinecone.bootstraps <- function(external.pinecone.clustering) {
  gg <- ggtree(TPA.MLtree)
  f2 <- facet_plot(gg, panel = "rPinecone ML tree", data = external.pinecone.clustering, geom = geom_tile, 
                   aes(x = as.numeric(factor(Sub.lineage))), fill = "blue")
  f2 <- facet_plot(f2, panel = "rPinecone 95%", data = external.pinecone.clustering, geom = geom_tile, 
                   aes(x = as.numeric(factor(pinecone_95))), fill = "red")
  f2 <- facet_plot(f2, panel = "rPinecone 80%", data = external.pinecone.clustering, geom = geom_tile, 
                   aes(x = as.numeric(factor(pinecone_80))), fill = "blue")
  f2 <- facet_plot(f2, panel = "rPinecone 50%", data = external.pinecone.clustering, geom = geom_tile, 
                   aes(x = as.numeric(factor(pinecone_50))), fill = "red")
  f2 <- facet_plot(f2, panel = "rPinecone 20%", data = external.pinecone.clustering, geom = geom_tile, 
                   aes(x = as.numeric(factor(pinecone_20))), fill = "blue")
  f2 <- facet_plot(f2, panel = "rPinecone 5%", data = external.pinecone.clustering, geom = geom_tile, 
                   aes(x = as.numeric(factor(pinecone_5))), fill = "red") +
    theme(strip.background =element_rect(fill="white"),strip.text = element_text(size= 8)) +
    theme.text.size
  return(f2)
}


p.pinecone.boootstrap.cluster.eval <- plot__pinecone.bootstraps(pinecone.10)

#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure3_Global-TPA.good-MLtree_vs_rPinecone-thresholds_02-2021.svg"), width = 600, height = 600,type="svg",units = "pt")
p.pinecone.boootstrap.cluster.eval
#dev.off()

```

rPinecone clusters are dependent on both SNP distance and tree topology. Bootstrapping these clusters is very hard, because the data are so clonal - particularly at the top of the tree. Removing just a few columns for a bootstrap can change the topology of the very clonal cluster. However, it is clear that many of these clusters elsewhere are still valid and reproducible at even very stringent support criteria. Therefore, for consistency, decided to require the same clusters to be present in at least 5% of trees - this robust allows amalgamation of the top clade into a coherent group. 

Since paper was largely analysed using non-bootstrapped clusters, will aim to use the same ordering scheme (but only use robust clusters).
```{r}
pinecone.10.3__5pc <- unique(pinecone.10[,c("Sub.lineage","pinecone_5")])

# want to rename bootstrapped clusters to be reasonably consistent with consensus tree analysis, and ensure sensible order on tree)
pinecone.10.3__5pc <- data.frame(pinecone.10 %>% dplyr::group_by(pinecone_5) %>% summarise(count=n()))
pinecone.10.3__5pc$is.singleton <- ifelse(pinecone.10.3__5pc$count==1,"singleton", "multi")
pinecone.10.3__5pc <- plyr::join(pinecone.10.3__5pc, unique(pinecone.10[,c("Sub.lineage","pinecone_5")]), by="pinecone_5", type='full')
pinecone.10.3__5pc$Sub.lineage.sing <- sapply(1:nrow(pinecone.10.3__5pc), function(x) ifelse(grepl("singleton",pinecone.10.3__5pc$Sub.lineage[x]),23,pinecone.10.3__5pc$Sub.lineage[x]))
pinecone.10.3__5pc <- pinecone.10.3__5pc[order(as.numeric(pinecone.10.3__5pc$Sub.lineage.sing),pinecone.10.3__5pc$Sub.lineage),]

# Now extract non-singletons and rename in order
new.names <- pinecone.10.3__5pc[pinecone.10.3__5pc$Sub.lineage.sing!=23,]
new.names <- data.frame(pinecone_5=unique(new.names$pinecone_5), stringsAsFactors = F)
new.names$pinecone_5_newname <- c(1:nrow(new.names))
# Now extract singletons and rename 
new.names.sing <- pinecone.10.3__5pc[pinecone.10.3__5pc$is.singleton=="singleton",]
new.names.sing <- data.frame(pinecone_5=unique(new.names.sing$pinecone_5), stringsAsFactors = F)
new.names.sing$pinecone_5_newname <- "Singleton"
# Combine new name list
new.names <- rbind(new.names,new.names.sing)
# integrate into list of types
pinecone.10.3__5pc <- plyr::join(pinecone.10.3__5pc, new.names, by="pinecone_5", type='left')

# now apply back to samples 
pinecone.10 <- plyr::join(pinecone.10, (unique(pinecone.10.3__5pc[,c("pinecone_5","pinecone_5_newname")])), by="pinecone_5", type="left")
pinecone.10$pinecone_5_newname.numeric <- as.numeric(sapply(1:nrow(pinecone.10), function(x) ifelse(grepl("Singleton",pinecone.10$pinecone_5_newname[x]),18,pinecone.10$pinecone_5_newname[x])))



```


re-import rPinecone classifications (original analysis)
```{r}

pinecone.clusters2 <- read.csv(pinecone.clusters.MLoriginal.file, row.names=1, comment.char="", check.names=F)



pinecone.clusters <- data.frame(pinecone.clusters2, stringsAsFactors = F)

#TPA.meta1.2
colnames(pinecone.clusters) <- c("Sample_Name", "pinecone.sublin.raw","pinecone.major.lin.raw", "TPA.pinecone.sublineage.pyjar")

# do some relabelling of major lineages
pinecone.clusters$TPA.pinecone.major <- ifelse(pinecone.clusters$pinecone.major.lin.raw=="0","outlier", ifelse(pinecone.clusters$pinecone.major.lin.raw=="1", "SS14", "Nichols"))

```

# Integrate pinecone data with full meta
```{r}
TPA.meta1.2.pinecone <- plyr::join(pinecone.clusters[,c("Sample_Name","TPA.pinecone.sublineage.pyjar","TPA.pinecone.major")], TPA.meta1.2, by="Sample_Name", type="left")

TPA.meta1.2.pinecone <- plyr::join(data.frame(Sample_Name=pinecone.10$Taxa, TPA.pinecone.sublineage=pinecone.10$pinecone_5_newname, stringsAsFactors = F),TPA.meta1.2.pinecone, by="Sample_Name", type="left")
```

```{r}
# How many Nichols are there?
nrow(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA_Lineage=="Nichols",])

# How many SS14 are there?
nrow(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA_Lineage=="SS14",])

# How many outliers are there? (these are technically SS14/Nichols, but also basal in the phylogeny)
nrow(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.major=="outlier",])

# what are the outlier (non-SS14/Nihols) genomes?
TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.major=="outlier",]

# How many Singletons are there?
nrow(TPA.meta1.2.pinecone[grepl("Singleton",TPA.meta1.2.pinecone$TPA.pinecone.sublineage),])
(TPA.meta1.2.pinecone[grepl("Singleton",TPA.meta1.2.pinecone$TPA.pinecone.sublineage),])

# How many Nichols Singletons are there?
nrow(TPA.meta1.2.pinecone[(grepl("Singleton",TPA.meta1.2.pinecone$TPA.pinecone.sublineage) & TPA.meta1.2.pinecone$TPA_Lineage=="Nichols"),])

# How many SS14 Singletons are there?
nrow(TPA.meta1.2.pinecone[(grepl("Singleton",TPA.meta1.2.pinecone$TPA.pinecone.sublineage) & TPA.meta1.2.pinecone$TPA_Lineage=="SS14"),])
(TPA.meta1.2.pinecone[(grepl("Singleton",TPA.meta1.2.pinecone$TPA.pinecone.sublineage) & TPA.meta1.2.pinecone$TPA_Lineage=="SS14"),])

```

Of the countries that have both Nichols and SS14, what is the breakdown?
```{r}
# Which countries have both SS14 and Nichols?
Countries.both.lineages <- data.frame(unique(TPA.meta1.2[,c("Geo_Country","TPA_Lineage")]) %>% dplyr::group_by(Geo_Country) %>% dplyr::summarise(count=n()))
Countries.both.lineages <- Countries.both.lineages[Countries.both.lineages$count==2,"Geo_Country"]

# Proportion of Nichols/SS14 in this subset of countries
TPA.meta1.2[TPA.meta1.2$Geo_Country %in% Countries.both.lineages,] %>% 
  dplyr::group_by(TPA_Lineage) %>%
  summarise(count=n()) %>%
  mutate(percentage=(count/sum(count))*100)

# and by country
Lineage.perc.country <- data.frame(TPA.meta1.2[TPA.meta1.2$Geo_Country %in% Countries.both.lineages,] %>% 
  dplyr::group_by(Geo_Country,TPA_Lineage) %>%
  summarise(count=n()) %>%
  mutate(percentage=(count/sum(count))*100),stringsAsFactors = F)
Lineage.perc.country

median(Lineage.perc.country[Lineage.perc.country$TPA_Lineage=="SS14","percentage"])
```



Description of the new dataset
```{r}
# Total samples in good tree
nrow(TPA.meta1.2.pinecone)
# Total 'new' samples in good tree
unique(TPA.meta1.2.pinecone$Citation)
nrow(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Citation %in% c("unpublished-WSI" ,"unpublished-Taiaroa"),])
nrow(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Citation %notin% c("unpublished-WSI" ,"unpublished-Taiaroa"),])

nrow(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Citation %in% c("This_Study"),])
nrow(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Citation %notin% c("This_Study"),])
```



Define colours for sublineages
```{r}
# Define sublineage clustering scheme using brew colourscales
#sublineages.cols.brew <- data.frame(unique(TPA.meta1.2.pinecone[,c("TPA.pinecone.major","TPA.pinecone.sublineage.pyjar")]), stringsAsFactors = F)
sublineages.cols.brew <- data.frame(unique(TPA.meta1.2.pinecone[,c("TPA.pinecone.major","TPA.pinecone.sublineage")]), stringsAsFactors = F)

sublineages.cols.brew <- sublineages.cols.brew[order(sublineages.cols.brew$TPA.pinecone.major,sublineages.cols.brew$TPA.pinecone.sublineage),]

sublineages.cols.brew$sublin.order <- as.numeric(as.character(sublineages.cols.brew$TPA.pinecone.sublineage))
sublineages.cols.brew <- sublineages.cols.brew[order(sublineages.cols.brew$sublin.order),]



#sublineages.cols.brew$sublineage.cols <- c(brewer.pal(n=7,"Blues")[2:7],brewer.pal(n=6,"Purples")[2:6],"grey80",brewer.pal(n=4,"YlOrBr")[c(2,3)], brewer.pal(n=7,"Reds")[2:6],brewer.pal(n=5,"Greens")[2:5],"grey80") 

# For revised bootstrapped clusters
sublineages.cols.brew$sublineage.cols <- sublineages.cols.brew$sublineage.cols <- c("#FC9272","#EF3B2C",brewer.pal(n=4,"Greens")[2:4],brewer.pal(n=4,"YlOrBr")[c(2,3)],brewer.pal(n=6,"Blues")[2:6],brewer.pal(n=6,"Purples")[2:6],"grey80","grey80")
  


sublineages.cols.brew <- unique(sublineages.cols.brew[,c("TPA.pinecone.sublineage","sublineage.cols")])
sublineages.cols.brew <- sublineages.cols.brew[order(as.numeric(as.character(sublineages.cols.brew$TPA.pinecone.sublineage))),]
sublineages.cols.brew$TPA.pinecone.sublineage <- factor(sublineages.cols.brew$TPA.pinecone.sublineage, levels=sublineages.cols.brew$TPA.pinecone.sublineage)

colnames(sublineages.cols.brew) <- c("sublineage","sublineage.cols")
sublineages.cols.brew <- unique(sublineages.cols.brew)

```


now plot trees
```{r}
#TPA.MLtree.ggtree <- ggtree(TPA.MLtree,layout = "fan",open.angle = 20, right=T)
#TPA.pyjar.treeplot 

p.TPA.pyjar.treeplot.tipsublineages <- TPA.pyjar.treeplot %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F) + 
  geom_tippoint(aes(color=Sublineage), size=1, alpha=0.5) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) 

p.TPA.pyjar.treeplot.stripsublineages <- gheatmap(TPA.pyjar.treeplot,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F), color=NULL,width=0.1,offset=0.0000005, colnames_angle=-45,colnames_offset_y=0.25, font.size=3) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) 
p.TPA.pyjar.treeplot.stripsublineages

```

Now for ML gubbins tree
```{r}
TPA.MLtree.ggtree <- ggtree(TPA.MLtree,layout = "fan",open.angle = 20, right=T)

TPA.MLtree.ggtree.tippoint <- TPA.MLtree.ggtree %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F) + 
  geom_tippoint(aes(color=Sublineage), size=0.75, alpha=0.5) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage)

p.TPA.MLtree.sublineages <- gheatmap(TPA.MLtree.ggtree.tippoint,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F), color=NULL,width=0.075,offset=0.00000025, colnames_angle=-45,colnames_offset_y=0.02, hjust=-0.0,font.size=2.25) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right')

#p.TPA.MLtree.sublineages


#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure2__goodcov-MLtree_circular__02-2021.svg"), width = 800, height = 800,type="svg",units = "pt")
p.TPA.MLtree.sublineages
#dev.off()


```


or a linear tree (which can also capture legend from) 
```{r}
TPA.ML.ggtree.linear <- ggtree(TPA.MLtree) %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F) + 
  geom_tippoint(aes(color=Sublineage), size=0.75, alpha=0.5, show.legend = F) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  # add bootstrap support
  geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=1.5, shape=18) +
  #geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=2.5, shape=18, alpha=0.5) +
  #geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=1.5, shape=18, color='cyan', alpha=0.5) +
  NULL

p.TPA.ML.ggtree.linear <- gheatmap(TPA.ML.ggtree.linear,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F), color=NULL,width=0.075,offset=0.00000025, colnames_angle=0,colnames_offset_y=-1, font.size=2) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.75,"line"),legend.position='bottom')

p.TPA.ML.ggtree.linear <- p.TPA.ML.ggtree.linear + new_scale_fill()

p.TPA.ML.ggtree.linear <- gheatmap(p.TPA.ML.ggtree.linear,TPA.rawseq.countries.p, color=NULL,width=0.075,offset=0.00000725, colnames_angle=0,colnames_offset_y=-1, font.size=2) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='bottom') + 
  geom_treescale(fontsize = 2.5, x=0.000001, y=50) +
  NULL

p.TPA.ML.ggtree.linear

# Capture legend from full tree
p.TPA.ML.ggtree.linear.legend <- get_legend(p.TPA.ML.ggtree.linear)

```

# Define subtrees using 'collapse' clade
```{r}

#ggtree(TPA.MLtree,layout = "fan",open.angle = 20, right=T) + geom_text2(aes(subset=!isTip, label=node), hjust=-.3) 
#ggtree(TPA.MLtree) + geom_text2(aes(subset=!isTip, label=node), hjust=-.3, size=3) 

SS14.subtree.nodeid <- 530 # 529
Nichols.subtree.nodeid <- 955 


```



Nichols subtree (with collapsed SS14)
```{r, fig.width=4, fig.height=5, message=FALSE, warning=FALSE}

#TPA.ML.ggtree.linear + geom_text2(aes(subset=!isTip, label=node), hjust=-.3, size=3) 

Nichols.coll <- ggtree(TPA.MLtree) %>% collapse(node=SS14.subtree.nodeid)

# Collaping node reduces 'y' position, so lets add some back for proper spacing
Nichols.coll$data[Nichols.coll$data$node==SS14.subtree.nodeid,"y"] <- Nichols.coll$data[Nichols.coll$data$node==SS14.subtree.nodeid,"y"] + 10

# now add triangle and text
Nichols.coll <- Nichols.coll + geom_text2(aes(subset=(node == SS14.subtree.nodeid)), size=20, label=intToUtf8(9664), hjust=0.2,vjust=.55, family="OpenSansEmoji", color="indianred1", alpha=.75)
Nichols.coll <- Nichols.coll + geom_text2(aes(subset=(node == SS14.subtree.nodeid)), cex=3, vjust=0.2, label="SS14",hjust = -1.5)
Nichols.coll <- Nichols.coll %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F) + 
  geom_tippoint(aes(color=Sublineage), size=1, alpha=0.25,show.legend=F) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  # add bootstrap support
  geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=1.5, shape=18) +
  #geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=2.5, shape=18, alpha=0.5) +
  #geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=1.5, shape=18, color="cyan", alpha=0.5) +
  NULL


p.TPA.Nichols.coll <- gheatmap(Nichols.coll,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F), color=NULL,width=0.085,offset=0.00000025, colnames_angle=0,colnames_offset_y=-1, font.size=2) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.85,"line"),legend.position='none')

p.TPA.Nichols.coll <- p.TPA.Nichols.coll + new_scale_fill()

p.TPA.Nichols.coll <- gheatmap(p.TPA.Nichols.coll,TPA.rawseq.countries.p, color=NULL,width=0.085,offset=0.00001025, colnames_angle=0,colnames_offset_y=-1, font.size=2) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size + theme(legend.key.size = unit(0.75,"line"),legend.position='none') +
  geom_treescale(fontsize = 2.5, x=0.000002, y=25) + 
  ylim(-3,116) +
  #ggtitle("Nichols-lineage phylogeny") +
  NULL
p.TPA.Nichols.coll

```


SS14 subtree (with collapsed Nichols)
```{r, fig.width=4, fig.height=5, message=FALSE, warning=FALSE}

#TPA.ML.ggtree.linear + geom_text2(aes(subset=!isTip, label=node), hjust=-.3, size=3) 

SS14.coll <- ggtree(TPA.MLtree) %>% collapse(node=Nichols.subtree.nodeid)

# Collaping node reduces 'y' position, so lets add some back for proper spacing
SS14.coll$data[SS14.coll$data$node==Nichols.subtree.nodeid,"y"] <- SS14.coll$data[SS14.coll$data$node==Nichols.subtree.nodeid,"y"] - 25

SS14.coll <- SS14.coll + geom_text2(aes(subset=(node == Nichols.subtree.nodeid)), size=20, label=intToUtf8(9664), hjust=0.2,vjust=.55, family="OpenSansEmoji", color="royalblue2", alpha=.85)
SS14.coll <- SS14.coll + geom_text2(aes(subset=(node == Nichols.subtree.nodeid)), cex=3, vjust=0.2, label="Nichols",hjust = -1)
SS14.coll <- SS14.coll %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F) + 
  geom_tippoint(aes(color=Sublineage), size=1, alpha=0.25,show.legend=F) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  # add bootstrap support
  geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=1.5, shape=18) +
  #geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=2.0, shape=18, alpha=0.25) +
  #geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=1.5, shape=18, color="cyan", alpha=0.5) +
  NULL
#SS14.coll


p.TPA.SS14.coll <- gheatmap(SS14.coll,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage,stringsAsFactors = F), color=NULL,width=0.085,offset=0.00000025, colnames_angle=0,colnames_offset_y=-1, font.size=2) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none')

p.TPA.SS14.coll <- p.TPA.SS14.coll + new_scale_fill()

p.TPA.SS14.coll <- gheatmap(p.TPA.SS14.coll,TPA.rawseq.countries.p, color=NULL,width=0.085,offset=0.00001025, colnames_angle=0,colnames_offset_y=-1, font.size=2) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none') +
  geom_treescale(fontsize = 2.5, x=0.000002, y=65) +
  #ggtitle("SS14-lineage phylogeny") +
  ylim(-30,429) +
  NULL

p.TPA.SS14.coll

```

Plot collapsed trees together
```{r, fig.width=9, fig.height=7, message=FALSE, warning=FALSE}

# make Nichols tree shorter
coll.trees.nichols.shorter <- plot_grid(NULL, p.TPA.Nichols.coll, NULL, ncol=1, rel_heights=c(2,7,1))

#coll.trees.row.1 <- plot_grid(p.TPA.Nichols.coll,p.TPA.SS14.coll,ncol=2, rel_widths=c(1,1), labels=c('Nichols','SS14'),label_size = 11)
#coll.trees.row.1 <- plot_grid(p.TPA.Nichols.coll,p.TPA.SS14.coll,ncol=2, rel_widths=c(1,1), labels=c('A','B'),label_size = 11)

#coll.trees.row.1 <- plot_grid(coll.trees.nichols.shorter,p.TPA.SS14.coll,ncol=2, rel_widths=c(1,1), labels=c('A','B'),label_size = 11)
coll.trees.row.1 <- plot_grid(p.TPA.SS14.coll,coll.trees.nichols.shorter,ncol=2, rel_widths=c(1,1), labels=c('A - SS14-lineage phylogeny','B - Nichols-lineage phylogeny'),label_size = 11)


coll.trees.row.2 <- plot_grid(p.TPA.ML.ggtree.linear.legend,ncol=1,labels='Key',label_size = 11)
coll.trees.combined <- plot_grid(coll.trees.row.1, coll.trees.row.2, ncol=1, rel_heights=c(3,1), scale=0.95)

coll.trees.combined
```


Sublineage timeline bubbleplot
```{r, fig.width=5, fig.height=3, message=FALSE, warning=FALSE}

#TPA.sublineages.counts <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Sample_Year!="-" & TPA.meta1.2.pinecone$TPA.pinecone.sublineage!="Singleton"),] %>% dplyr::group_by(Sample_Year, TPA.pinecone.sublineage) %>% 
#  dplyr::summarise(Count=n())

TPA.sublineages.counts <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Sample_Year!="-" ),] %>% dplyr::group_by(Sample_Year, TPA.pinecone.sublineage) %>% 
  dplyr::summarise(Count=n())



TPA.sublineages.counts$Sample_Year <- ifelse(TPA.sublineages.counts$Sample_Year=="1960-1980","1970",TPA.sublineages.counts$Sample_Year)

TPA.sublineages.counts <- plyr::join(data.frame(Sample_Year=c(1912:2019),stringsAsFactors=F), TPA.sublineages.counts)
TPA.sublineages.counts <- TPA.sublineages.counts[!is.na(TPA.sublineages.counts$TPA.pinecone.sublineage),]
TPA.sublineages.counts$TPA.pinecone.sublineage <- factor(TPA.sublineages.counts$TPA.pinecone.sublineage, levels=rev(sublineages.cols.brew$sublineage))


p.sublineage.temporal.bubbleplot <- ggplot(TPA.sublineages.counts, aes(Sample_Year, TPA.pinecone.sublineage, colour=TPA.pinecone.sublineage)) +
  geom_point(alpha=0.65, aes(size=Count)) + 
  guides(colour=FALSE) +
  theme_light() +
  scale_size_area(max_size = 8,breaks=c(1,5,10,25,50,75,100)) +
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right') +
  coord_cartesian(xlim=c(1950,2020)) +
  labs(y="Sublineage", x="Sample Year")

p.sublineage.temporal.bubbleplot.legend <- get_legend(p.sublineage.temporal.bubbleplot)

p.sublineage.temporal.bubbleplot
```


```{r}
#sublineage.counts <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Sample_Year!="-" & TPA.meta1.2.pinecone$TPA.pinecone.sublineage!="Singleton"),] %>% group_by(TPA.pinecone.sublineage) %>% summarise(Count=n())
sublineage.counts <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Sample_Year!="-" ),] %>% group_by(TPA.pinecone.sublineage) %>% summarise(Count=n())


sublineage.counts$TPA.pinecone.sublineage <- factor(sublineage.counts$TPA.pinecone.sublineage,levels=rev(sublineages.cols.brew$sublineage))

p.sublineage.hbarplot <- ggplot(sublineage.counts, aes(Count,TPA.pinecone.sublineage,fill=TPA.pinecone.sublineage)) +
  geom_barh(stat="identity", position="stack", width=0.75) +
  theme_light() +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none') +
  #scale_fill_manual(name="Country",values=sublineage.counts$TPA.pinecone.sublineage, breaks=sublineage.counts$Geo_Country) +
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  geom_text(data=sublineage.counts, aes((Count+10), TPA.pinecone.sublineage,label=Count), size=2.5, inherit.aes = F) +
  labs(y="Sublineage", x="Samples/Sublineage")
p.sublineage.hbarplot  
```
Proportion by country
```{r}
#sublineage.country.counts <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Sample_Year!="-" & TPA.meta1.2.pinecone$TPA.pinecone.sublineage!="Singleton"),] %>% group_by(TPA.pinecone.sublineage, Geo_Country) %>% summarise(Count=n())
sublineage.country.counts <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Sample_Year!="-" ),] %>% group_by(TPA.pinecone.sublineage, Geo_Country) %>% summarise(Count=n())

sublineage.country.counts$TPA.pinecone.sublineage <- factor(sublineage.country.counts$TPA.pinecone.sublineage,levels=rev(sublineages.cols.brew$sublineage))

p.sublineage.country.hbarplot <- ggplot(sublineage.country.counts, aes(Count,TPA.pinecone.sublineage,fill=Geo_Country)) +
  geom_barh(stat="identity", position="fill", width=0.75) +
  theme_light() +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none') +
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  labs(y="Sublineage", x="Country Proportion")
p.sublineage.country.hbarplot
```


```{r, fig.width=10, fig.height=10, message=FALSE, warning=FALSE}
p.sublineage.timeline.bubbleplot.combined <- plot_grid(p.sublineage.temporal.bubbleplot + theme(legend.position="none"), p.sublineage.hbarplot + y.theme.strip, p.sublineage.country.hbarplot + y.theme.strip, p.sublineage.temporal.bubbleplot.legend, ncol=4, align=T, rel_widths=c(5,2,2,1),labels=c('C','D','E',''),label_size = 11,vjust=-0.25) #+ theme(legend.position="right")


gg.colltrees.sublineage.distributions <- plot_grid(coll.trees.combined, p.sublineage.timeline.bubbleplot.combined, ncol=1, rel_heights=c(5,2))
#gg.colltrees.sublineage.distributions


```

```{r, fig.width=10, fig.height=10, message=FALSE, warning=FALSE}

#Cairo::Cairo(file=paste0(Figure_output_directory, "Figure2__goodcov-MLtree_+sublineage-distros__02-2021.svg"), width = 900, height = 900,type="svg",units = "pt")
gg.colltrees.sublineage.distributions
#dev.off()
```



# Now take a deep dive into Single country/region sublineage dynamics (UK-wide v.s. British Columbia, Canada)

```{r}

TPA.sublineage_UK.Canada.temporal <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Geo_Country=="UK" | TPA.meta1.2.pinecone$Geo_Region=="British_Columbia"),]

TPA.sublineage_UK.Canada.temporal.counts <- data.frame(TPA.sublineage_UK.Canada.temporal %>% dplyr::group_by(TPA.pinecone.sublineage,Sample_Year,Geo_Country,TPA.pinecone.major) %>% 
  dplyr::summarise(Sample.Count=n()), stringsAsFactors = F)


# Fix dates and make continuous
TPA.sublineage_UK.Canada.temporal.counts$Sample_Year <- as.numeric(TPA.sublineage_UK.Canada.temporal.counts$Sample_Year)


TPA.sublineage_UK.Canada.temporal.counts <- plyr::join(data.frame(Sample_Year=c(1912:2019),stringsAsFactors=F),TPA.sublineage_UK.Canada.temporal.counts, by="Sample_Year", type="left")

TPA.sublineage_UK.Canada.temporal.counts <- TPA.sublineage_UK.Canada.temporal.counts[!is.na(TPA.sublineage_UK.Canada.temporal.counts$TPA.pinecone.sublineage),]

# order by sublineage
TPA.sublineage_UK.Canada.temporal.counts$TPA.pinecone.sublineage <- factor(TPA.sublineage_UK.Canada.temporal.counts$TPA.pinecone.sublineage, levels=rev(sublineages.cols.brew$sublineage))

# make bubbleplot
plot.TPA.sublineage_UK.Canada.temporal.counts.bubbleplot <- 
  ggplot(TPA.sublineage_UK.Canada.temporal.counts, aes(Sample_Year, TPA.pinecone.sublineage, color=TPA.pinecone.sublineage)) +
  geom_point(alpha=0.70,aes(size=Sample.Count)) + 
  geom_line(alpha=0.25) +
  theme_light() +
  labs(x="Sample Year", y="Sublineage", size="Sample Count") +
  coord_cartesian(xlim=c(2000,2020)) +
  theme(strip.text.y = element_text(angle = 0)) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  facet_grid(.~Geo_Country) + 
  scale_size_area(max_size = 10,breaks=c(1,5,10,20,30,40,50)) +
  theme.text.size + 
  theme(legend.key.size = unit(0.65,"line")) +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.x = element_text(color = "grey25")) +
  NULL
plot.TPA.sublineage_UK.Canada.temporal.counts.bubbleplot 


```

get population prevalence stats
```{r}

UK.stats <- read.table(UK.stats.file,sep="\t",header=T)
UK.stats$Country <- "UK"


BC.stats <- read.table(BC.stats.file,sep="\t",header=T)
BC.stats$Country <- "British Columbia"

UK.BC.stats.combined <- rbind(UK.stats,BC.stats)
UK.BC.stats.combined[UK.BC.stats.combined$Country=="UK","Country"] <- "England"


plot.UK.BC.stats.combined <- ggplot(UK.BC.stats.combined, aes(Year,Total)) + 
  geom_line() +
  theme_light() +
  labs(x="Year",y="Syphilis Incidence/100,000") + 
  #scale_x_continuous(breaks=seq(2008,2019,2)) +
  coord_cartesian(xlim=c(2000,2020)) +
  #ggtitle("Syphilis Incidence data") +
  facet_grid(.~Country) +
  theme.text.size +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.x = element_text(color = "grey25", size=10))
plot.UK.BC.stats.combined

```

```{r, fig.width=5.5, fig.height=4.5, message=FALSE, warning=FALSE}
plot_grid(plot.UK.BC.stats.combined + x.theme.strip + ggtitle("Syphilis Incidence and sublineage count"), plot.TPA.sublineage_UK.Canada.temporal.counts.bubbleplot + theme(legend.position="bottom",strip.background = element_blank(),strip.text.x = element_blank()), ncol=1, align=T, rel_heights=c(1,2), labels=c('A','B'), label_size=11)

```

some stats about the canadian outbreak
```{r}
BC.sublineage.summary <- data.frame(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Geo_Region=="British_Columbia",] %>% dplyr::group_by(TPA.pinecone.sublineage,Geo_Country,TPA.pinecone.major) %>% dplyr::summarise(Sample.Count=n()), stringsAsFactors = F)

(BC.sublineage.summary[BC.sublineage.summary$TPA.pinecone.sublineage==1,"Sample.Count"]/sum(BC.sublineage.summary$Sample.Count))*100

BC.sublineage.summary.pre2010 <- data.frame(TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Geo_Region=="British_Columbia" & TPA.meta1.2.pinecone$Sample_Year<=2010),] %>% dplyr::group_by(TPA.pinecone.sublineage,Geo_Country,TPA.pinecone.major) %>% dplyr::summarise(Sample.Count=n()), stringsAsFactors = F)

(BC.sublineage.summary.pre2010[BC.sublineage.summary.pre2010$TPA.pinecone.sublineage==1,"Sample.Count"]/sum(BC.sublineage.summary.pre2010$Sample.Count))*100

BC.sublineage.summary.post2011 <- data.frame(TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Geo_Region=="British_Columbia" & TPA.meta1.2.pinecone$Sample_Year>=2011),] %>% dplyr::group_by(TPA.pinecone.sublineage,Geo_Country,TPA.pinecone.major) %>% dplyr::summarise(Sample.Count=n()), stringsAsFactors = F)

(BC.sublineage.summary.post2011[BC.sublineage.summary.post2011$TPA.pinecone.sublineage==1,"Sample.Count"]/sum(BC.sublineage.summary.post2011$Sample.Count))*100

```
and the UK dataset
```{r}
UK.sublineage.summary <- data.frame(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Geo_Country=="UK",] %>% dplyr::group_by(TPA.pinecone.sublineage,Geo_Country,TPA.pinecone.major) %>% dplyr::summarise(Sample.Count=n()), stringsAsFactors = F)

```

\
# Now look more globally \
Sublineages per country - note that this plot does not account for multiple Singletons being in the same country - replaced.
```{r}
sublineage.count.per.country <- data.frame(unique(TPA.meta1.2.pinecone[,c("TPA.pinecone.sublineage","Geo_Country")]) %>% dplyr::group_by(Geo_Country) %>%
  dplyr::summarise(count=n()), stringsAsFactors = F)

sublineage.count.per.country$Geo_Country <- factor(sublineage.count.per.country$Geo_Country, levels=continental.country.cols.brew2$Geo_Country)


p.sublineages.per.country.bar <- ggplot(sublineage.count.per.country, aes(Geo_Country, count, fill=Geo_Country)) + 
  geom_bar(stat='identity', width=0.75) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) + 
  theme_light() +
  x.theme.axis.rotate + 
  theme.text.size +
  labs(x="Country", y="Sublineages/Country") +
  theme(legend.key.size = unit(0.65,"line"),legend.position='bottom') + 
  scale_y_continuous(breaks=seq(0,14,2))

p.sublineages.per.country.bar
```

Look at Singleton, private, and multi-country sublineages
```{r}
# How many Singleton lineages are in each country?
Singleton.country.counts <- data.frame(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage=="Singleton",c("Sample_Name","Geo_Country")] %>% 
  dplyr::group_by(Geo_Country) %>%
  dplyr::summarise(per.country=n()), stringsAsFactors = F)
Singleton.country.counts <- plyr::join(data.frame(Geo_Country=c(unique(TPA.meta1.2.pinecone$Geo_Country))), Singleton.country.counts, by="Geo_Country", type="left")
Singleton.country.counts[is.na(Singleton.country.counts$per.country),"per.country"] <- 0
Singleton.country.counts$lineage.type <- "Singleton in country"

Singleton.country.counts
TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage=="Singleton","TPA_Lineage"]

Private.country.counts <- unique(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage!="Singleton",c("TPA.pinecone.sublineage", "Geo_Country")]) %>% 
  dplyr::group_by(TPA.pinecone.sublineage) %>%
  summarise(Countries.count=n(),.groups="keep")
Private.country.counts$private.distro <- ifelse(Private.country.counts$Countries.count==1,"private","multicountry")

Private.country.counts

# How many private lineages are in each country?
Private.country.locations <- unique(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage %in% as.character(unlist(Private.country.counts[Private.country.counts$private.distro=='private',"TPA.pinecone.sublineage"])),c(c("TPA.pinecone.sublineage", "Geo_Country"))])
Private.country.location.counts <- Private.country.locations %>% dplyr::group_by(Geo_Country) %>%
  dplyr::summarise(per.country=n())
Private.country.location.counts <- plyr::join(data.frame(Geo_Country=c(unique(TPA.meta1.2.pinecone$Geo_Country)),stringsAsFactors = F), Private.country.location.counts, by="Geo_Country", type="left")
Private.country.location.counts[is.na(Private.country.location.counts$per.country),"per.country"] <- 0
Private.country.location.counts$lineage.type <- "Private sublineage to country"

# How many multicountry lineages are in each country?
multi.country.locations.counts <- unique(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage %in% as.character(unlist(Private.country.counts[Private.country.counts$private.distro=='multicountry',"TPA.pinecone.sublineage"])),c(c("TPA.pinecone.sublineage", "Geo_Country"))])
multi.country.locations.counts <- multi.country.locations.counts %>% dplyr::group_by(Geo_Country) %>%
  dplyr::summarise(per.country=n())
multi.country.locations.counts <- plyr::join(data.frame(Geo_Country=c(unique(TPA.meta1.2.pinecone$Geo_Country)),stringsAsFactors = F), multi.country.locations.counts, by="Geo_Country", type="left")
multi.country.locations.counts[is.na(multi.country.locations.counts$per.country),"per.country"] <- 0
multi.country.locations.counts$lineage.type <- "Multi-country sublineage"

classified.sublineages.per.country <- rbind(multi.country.locations.counts,Private.country.location.counts,Singleton.country.counts)
classified.sublineages.per.country$Geo_Country <- factor(classified.sublineages.per.country$Geo_Country, levels=continental.country.cols.brew2$Geo_Country)
classified.sublineages.per.country$lineage.type <- factor(classified.sublineages.per.country$lineage.type, levels=rev(unique(classified.sublineages.per.country$lineage.type)))

plot.classified.sublineages.per.country <- ggplot(classified.sublineages.per.country, aes(Geo_Country, per.country, fill=lineage.type)) +
  geom_bar(position="stack", stat="identity", width=0.75) +
  theme_light() + 
  theme.text.size + 
  x.theme.axis.rotate +
  labs(x="Country",y="Sublineage/Country", fill="Sublineage Type") +
  scale_y_continuous(breaks=seq(0,18,2)) +
  theme(legend.key.size = unit(0.65,"line"),legend.position='right') + 
  ggtitle("Sublineage types by country") + 
  theme(plot.title = element_text(size = 10)) +
  scale_fill_manual(values=c("grey80","grey50","grey10"))
plot.classified.sublineages.per.country

plot.classified.sublineages.per.country.hbar <- ggplot(classified.sublineages.per.country, aes(per.country,Geo_Country, fill=lineage.type)) +
  geom_barh(position="stack", stat="identity", width=0.75) +
  theme_light() + 
  theme.text.size + 
  #x.theme.axis.rotate +
  labs(y="Country",x="Sublineage Count", fill="Sublineage Type") +
  scale_x_continuous(breaks=seq(0,18,2)) +
  theme(legend.key.size = unit(0.65,"line"),legend.position='right') + 
  #ggtitle("Sublineage types by country") + 
  theme(plot.title = element_text(size = 10)) +
  scale_fill_manual(values=c("grey80","grey50","grey10"))
plot.classified.sublineages.per.country.hbar


#plot_grid(p.sublineages.per.country.bar + theme(legend.position="none") + ggtitle("Number of sublineages in each country") + theme(plot.title = element_text(size=10)) + x.theme.strip, plot.classified.sublineages.per.country +theme(legend.position="bottom"), align=T, ncol=1, rel_heights=c(1,3))
```

Look at number of countries for each sublineage (v.s. sampling counts)
```{r}
sublineage.country.distro.vs.total.counts <- plyr::join(data.frame(Private.country.counts,stringsAsFactors = F), sublineage.counts, by="TPA.pinecone.sublineage", type="left")

#library(ggrepel)
ggplot(sublineage.country.distro.vs.total.counts, aes(Count, Countries.count, size=Count, colour=TPA.pinecone.sublineage)) +
  geom_jitter(alpha=0.75) + 
  #geom_label_repel(data=sublineage.country.distro.vs.total.counts, aes(Count, Countries.count, label=TPA.pinecone.sublineage), segment.color = 'grey50', inherit.aes = F, size=2.5, box.padding=0.75) +
  #geom_point(alpha=0.75) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) + 
  theme_light() + 
  scale_size_binned(range = c(1, 10),breaks=c(5,20,50,100,200,400)) +
  scale_x_log10() +
  scale_y_continuous(breaks=seq(0,20,2)) +
  labs(y="Countries (Count)", x="Samples (Count)", size="Sample Count") +
  theme(legend.key.size = unit(0.65,"line"),legend.position='bottom') 

```

Summary of sublineage classifications
```{r}

sublineage.classification <- Private.country.counts
sublineage.classification <- sublineage.classification[order(as.numeric(sublineage.classification$TPA.pinecone.sublineage)),]

# How many private sublineages?
nrow(sublineage.classification[sublineage.classification$private.distro=='private',])
# How many mult-country sublineages?
nrow(sublineage.classification[sublineage.classification$private.distro=='multicountry',])

# Where are the Singletons
Singleton.country.counts[Singleton.country.counts$per.country!=0,]

# Where are the private lineages?
Private.country.location.counts[Private.country.location.counts$per.country!=0,]

```





# Pairwise SNP analyses

```{r}
# import multiple sequence alignment
TPA.WGS.alignment.data <- pairsnp::import_fasta_sparse(TPA.MSA.SNPs.aln.file)

# run pairsnp
TPA.WGS.alignment.data.dist <- pairsnp::snp_dist(TPA.WGS.alignment.data)
TPA.WGS.alignment.data.dist.melt <- reshape2::melt(TPA.WGS.alignment.data.dist)
colnames(TPA.WGS.alignment.data.dist.melt) <- c("Taxa1", "Taxa2", "Distance")

# Bring in and merge metadata
TPA.meta1.2.pairwise.t1 <- TPA.meta1.2.pinecone[,c("Cleaned_fastq_id","Sample_Name","Sample_Year","Geo_Country","Continent","TPA.pinecone.major","TPA.pinecone.sublineage", "TPA_Lineage")]
colnames(TPA.meta1.2.pairwise.t1) <- paste0(colnames(TPA.meta1.2.pairwise.t1),".t1")
colnames(TPA.meta1.2.pairwise.t1)[2] <- "Taxa1"
TPA.meta1.2.pairwise.t2 <- TPA.meta1.2.pinecone[,c("Cleaned_fastq_id","Sample_Name","Sample_Year","Geo_Country","Continent","TPA.pinecone.major","TPA.pinecone.sublineage", "TPA_Lineage")]
colnames(TPA.meta1.2.pairwise.t2) <- paste0(colnames(TPA.meta1.2.pairwise.t2),".t2")
colnames(TPA.meta1.2.pairwise.t2)[2] <- "Taxa2"

TPA.alignment.data.dist.melt.meta <- plyr::join(TPA.WGS.alignment.data.dist.melt,TPA.meta1.2.pairwise.t1, by="Taxa1", type="left") 
TPA.alignment.data.dist.melt.meta <- plyr::join(TPA.alignment.data.dist.melt.meta,TPA.meta1.2.pairwise.t2, by="Taxa2", type="left")

```

Define comparisons
```{r}
# Same sample
TPA.alignment.data.dist.melt.meta$same.sample <- ifelse(TPA.alignment.data.dist.melt.meta$Taxa1==TPA.alignment.data.dist.melt.meta$Taxa2,"same", "different")

# Years between samples
TPA.alignment.data.dist.melt.meta$year.distance <- abs(as.numeric(TPA.alignment.data.dist.melt.meta$Sample_Year.t1) - as.numeric(TPA.alignment.data.dist.melt.meta$Sample_Year.t2))

# Same country
TPA.alignment.data.dist.melt.meta$same.country <- ifelse(TPA.alignment.data.dist.melt.meta$Geo_Country.t1 == TPA.alignment.data.dist.melt.meta$Geo_Country.t2, "same", "different")

# Same continent
TPA.alignment.data.dist.melt.meta$same.continent <- ifelse(TPA.alignment.data.dist.melt.meta$Continent.t1 == TPA.alignment.data.dist.melt.meta$Continent.t2, "same", "different")

# Same TPA Major Lineage
TPA.alignment.data.dist.melt.meta$same.TPA.majorlineage <- ifelse(TPA.alignment.data.dist.melt.meta$TPA.pinecone.major.t1==TPA.alignment.data.dist.melt.meta$TPA.pinecone.major.t2, "same", "different")
TPA.alignment.data.dist.melt.meta$same.TPA.majorlineage <- sapply(1:nrow(TPA.alignment.data.dist.melt.meta), function(x) ifelse((TPA.alignment.data.dist.melt.meta$TPA.pinecone.major.t1[x]=="0" | TPA.alignment.data.dist.melt.meta$TPA.pinecone.major.t2[x]=="0"),NA,TPA.alignment.data.dist.melt.meta$same.TPA.majorlineage[x]))

# Same TPA Lineage (cleaned up classifications)
TPA.alignment.data.dist.melt.meta$same.TPA.Lineage <- ifelse(TPA.alignment.data.dist.melt.meta$TPA_Lineage.t1==TPA.alignment.data.dist.melt.meta$TPA_Lineage.t2, "same", "different")
TPA.alignment.data.dist.melt.meta$same.TPA.Lineage <- sapply(1:nrow(TPA.alignment.data.dist.melt.meta), function(x) ifelse((TPA.alignment.data.dist.melt.meta$TPA_Lineage.t1[x]=="0" | TPA.alignment.data.dist.melt.meta$TPA_Lineage.t2[x]=="0"),NA,TPA.alignment.data.dist.melt.meta$same.TPA.Lineage[x]))


# Same TPA sublineage
TPA.alignment.data.dist.melt.meta$same.TPA.Pinecone.cluster <- ifelse(TPA.alignment.data.dist.melt.meta$TPA.pinecone.sublineage.t1==TPA.alignment.data.dist.melt.meta$TPA.pinecone.sublineage.t2,"same", "different")
TPA.alignment.data.dist.melt.meta$same.TPA.Pinecone.cluster <- sapply(1:nrow(TPA.alignment.data.dist.melt.meta), function(x) ifelse(((TPA.alignment.data.dist.melt.meta$same.sample[x]=="different" & TPA.alignment.data.dist.melt.meta$TPA.pinecone.sublineage.t1[x]=="Singleton") |(TPA.alignment.data.dist.melt.meta$same.sample[x]=="different" & TPA.alignment.data.dist.melt.meta$TPA.pinecone.sublineage.t2[x]=="Singleton")),"different",TPA.alignment.data.dist.melt.meta$same.TPA.Pinecone.cluster[x]))

# Country Comparisons label
TPA.alignment.data.dist.melt.meta$Country_combinations <- paste0(TPA.alignment.data.dist.melt.meta$Geo_Country.t1,"___",TPA.alignment.data.dist.melt.meta$Geo_Country.t2)

```


Do some analysis of SNP distances within each country
```{r}

TPA.alignment.data.dist.melt.meta$Geo_Country.t1 <- factor(TPA.alignment.data.dist.melt.meta$Geo_Country.t1, levels=continental.country.cols.brew2$Geo_Country)

p.pairwise.snps.withinCountry <- ggplot(TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.country=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different"),], aes(Geo_Country.t1, Distance, color=Geo_Country.t1)) +
  geom_sina(alpha=0.25, scale='width', method="d") +
  theme_light() +
  theme.text.size + 
  #theme(axis.text.x = element_text(angle = -45, vjust = 0.5, hjust=0.1)) +
  x.theme.axis.rotate +
  labs(x="Country",y="Pairwise SNPs") +
  scale_color_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme(legend.key.size = unit(0.65,"line"),legend.position='right') + 
  ggtitle("Pairwise SNP distributions between samples from same countries") + 
  theme(plot.title = element_text(size = 10)) +
  NULL
p.pairwise.snps.withinCountry

```


Split up by Major Lineage
```{r}
#TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.country=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different" & TPA.alignment.data.dist.melt.meta$same.TPA_Lineage=="same"),]

# Max pairwise distance within country and within Lineages
max(TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.country=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different" & TPA.alignment.data.dist.melt.meta$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta$TPA_Lineage.t1=="SS14"),"Distance"])

max(TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.country=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different" & TPA.alignment.data.dist.melt.meta$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta$TPA_Lineage.t1=="Nichols"),"Distance"])

p.pairwise.snps.withinCountry.within.Lineage <- ggplot(TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.country=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different" & TPA.alignment.data.dist.melt.meta$same.TPA.majorlineage=="same"),], aes(Geo_Country.t1, Distance, color=Geo_Country.t1)) +
  geom_sina(alpha=0.5, scale='width', method="d") +
  theme_light() +
  theme.text.size + 
  x.theme.axis.rotate +
  labs(x="Country",y="Pairwise SNPs") +
  scale_color_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme(legend.key.size = unit(0.65,"line"),legend.position='right') + 
  ggtitle("Pairwise SNP distributions between samples from same countries by lineage") + 
  theme(plot.title = element_text(size = 10)) +
  facet_grid(TPA_Lineage.t1~.) +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.y = element_text(color = "grey25", size=10)) +
  NULL
p.pairwise.snps.withinCountry.within.Lineage


```

Do a combined plot (all, plus by major lineage)
```{r}

TPA.alignment.data.dist.melt.meta.LineageCountryCombined <- TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.country=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different" & TPA.alignment.data.dist.melt.meta$same.TPA.majorlineage=="same"),c("Geo_Country.t1", "Distance","TPA_Lineage.t1")]
TPA.alignment.data.dist.melt.meta.LineageCountryCombined$TPA_Lineage <- TPA.alignment.data.dist.melt.meta.LineageCountryCombined$TPA_Lineage.t1

TPA.alignment.data.dist.melt.meta.LineageCountryCombined.2 <- TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.country=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different"),c("Geo_Country.t1", "Distance","TPA_Lineage.t1")]
TPA.alignment.data.dist.melt.meta.LineageCountryCombined.2$TPA_Lineage <- "All"


TPA.alignment.data.dist.melt.meta.LineageCountryCombined <- rbind(TPA.alignment.data.dist.melt.meta.LineageCountryCombined, TPA.alignment.data.dist.melt.meta.LineageCountryCombined.2)

p.pairwise.snps.withinCountry.within.all.Lineage <- ggplot(TPA.alignment.data.dist.melt.meta.LineageCountryCombined, aes(Geo_Country.t1, Distance, color=Geo_Country.t1)) +
  geom_sina(alpha=0.5, scale='width', method="d") +
  theme_light() +
  theme.text.size + 
  x.theme.axis.rotate +
  labs(x="Country",y="Pairwise SNPs") +
  scale_color_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme(legend.key.size = unit(0.65,"line"),legend.position='right') + 
  ggtitle("Pairwise SNP distributions among samples within the same country") +
  
  theme(plot.title = element_text(size = 10)) +
  facet_grid(TPA_Lineage~.) +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.y = element_text(color = "grey25", size=10)) +
  NULL
p.pairwise.snps.withinCountry.within.all.Lineage

```




Look at pairwise distances between countries (e.g. minimum pairwise distance)
```{r}
TPA.alignment.data.dist.melt.meta.between.countries <- TPA.alignment.data.dist.melt.meta[TPA.alignment.data.dist.melt.meta$same.country=="different",]

TPA.alignment.data.dist.melt.meta.between.countries.mindist <- TPA.alignment.data.dist.melt.meta.between.countries %>% 
  dplyr::group_by(Country_combinations) %>%
  summarise(min.dist=min(Distance))
TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa1 <- gsub("\\_\\_\\_.+$","",TPA.alignment.data.dist.melt.meta.between.countries.mindist$Country_combinations, perl=T)
TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa2 <- gsub("^.+\\_\\_\\_","",TPA.alignment.data.dist.melt.meta.between.countries.mindist$Country_combinations, perl=T)

TPA.alignment.data.dist.melt.meta.between.countries.mindist$log10.mindist <- log10(TPA.alignment.data.dist.melt.meta.between.countries.mindist$min.dist)

TPA.alignment.data.dist.melt.meta.between.countries.mindist$log2.mindist <- log2(TPA.alignment.data.dist.melt.meta.between.countries.mindist$min.dist)

TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa1 <- factor(TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa1, levels=continental.country.cols.brew2$Geo_Country)
TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa2 <- factor(TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa2, levels=continental.country.cols.brew2$Geo_Country)


#WGS.alignment.data.dist.melt.meta.between.countries.mindist.matrix <- dcast(WGS.alignment.data.dist.melt.meta.between.countries.mindist[,c(3,4,2)], taxa1~taxa2)

taxa1.pwise.country.cols <- unique(TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa1)


p.country.minsnp.heatmap <- ggplot(TPA.alignment.data.dist.melt.meta.between.countries.mindist, aes(Taxa1, Taxa2, fill=min.dist)) +
  geom_tile(color="white") +
  scale_fill_gradient(low="yellow",high="red", trans="log2",name="Minimum Pairwise SNPs") + 
  #theme_classic() +
  theme_light() +
  theme.text.size + 
  scale_x_discrete(position = 'top') +
  #theme(axis.text.x=element_text(angle=90,hjust=0), axis.title.x = element_blank(),
  #      axis.title.y = element_blank()) +
  theme(axis.text.x=element_text(angle=90,vjust=1,hjust=0), axis.title.x = element_blank()) +
  geom_text(aes(label = min.dist), color = "black", size = 2.5) +
  #theme(axis.text.x = element_text(colour=data.frame(continental.country.cols.brew2)[continental.country.cols.brew2$Geo_Country!="Belgium","country.col"]), axis.text.y = element_text(colour=data.frame(continental.country.cols.brew2)[continental.country.cols.brew2$Geo_Country!="Belgium","country.col"])) +
  theme(axis.text.y = element_text(colour=data.frame(continental.country.cols.brew2)[continental.country.cols.brew2$Geo_Country!="Belgium","country.col"]))  +
  theme(legend.key.size = unit(0.65,"line"), legend.position='left') +
  #ggtitle("Minimum Pairwise SNPs between samples from different countries") +
  theme(plot.title = element_text(size = 10)) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + 
  labs(y="Country") +
  NULL



p.country.minsnp.heatmap
```

Samples per country (high quality genomes) - needed to give some context to the heatmap plot
```{r}
TPA.pinecone.genome.counts <- TPA.meta1.2.pinecone %>% dplyr::group_by(Geo_Country) %>%
  dplyr::summarise(Total_samples=n())
TPA.pinecone.genome.counts$Geo_Country <- factor(TPA.pinecone.genome.counts$Geo_Country, levels=continental.country.cols.brew2$Geo_Country)

p.hq.country.hbarplot <- ggplot(TPA.pinecone.genome.counts, aes(Total_samples,Geo_Country,fill=Geo_Country)) +
  geom_barh(stat="identity", position="stack", width=0.75) +
  geom_text(data=TPA.pinecone.genome.counts, aes((Total_samples+25), Geo_Country, label=Total_samples), size=2.5, inherit.aes = F) +
  #theme_classic() + 
  theme_light() +
  scale_x_continuous(breaks=c(0,100,200,300)) +
  coord_cartesian(xlim=c(0,275)) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none') +
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  labs(y="Country", x="Sample Count") +
  #theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) 
  NULL
#p.hq.country.hbarplot


p.hq.country.barplot <- ggplot(TPA.pinecone.genome.counts, aes(Geo_Country, Total_samples,fill=Geo_Country)) +
  geom_bar(stat="identity", position="stack", width=0.75) +
  geom_text(data=TPA.pinecone.genome.counts, aes(Geo_Country, (Total_samples+25), label=Total_samples), size=2.5, inherit.aes = F) +
  theme_light() +
  scale_y_continuous(breaks=c(0,100,200,300)) +
  coord_cartesian(ylim=c(0,275)) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='none') +
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  labs(x="Country", y="Sample Count") +
  x.theme.axis.rotate +
  NULL
#p.hq.country.hbarplot

```


plot together
```{r}
p.country.minsnp.heatmap.counts <- plot_grid(p.country.minsnp.heatmap, p.hq.country.hbarplot + y.theme.strip, align=T, ncol=2, rel_widths=c(5,1))
p.country.minsnp.heatmap.counts 
```


Plot sina of within-country SNPs alongside heatmap of min-SNPs between countries

```{r, fig.width=10, fig.height=5, message=FALSE, warning=FALSE}
multicountry.pairwise.snps.grid <- plot_grid(p.pairwise.snps.withinCountry,p.country.minsnp.heatmap.counts, ncol=2, rel_widths=c(2,3), labels=c('D','E'), label_size=11)
multicountry.pairwise.snps.grid
```


Get combination details for all zero pairings
```{r}
# ensure all possible combos are included by 2-siding
TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations <- rbind(data.frame(min.dist=TPA.alignment.data.dist.melt.meta.between.countries.mindist$min.dist, taxa1=TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa1,taxa2=TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa2, stringsAsFactors = F),data.frame(min.dist=TPA.alignment.data.dist.melt.meta.between.countries.mindist$min.dist, taxa1=TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa2,taxa2=TPA.alignment.data.dist.melt.meta.between.countries.mindist$Taxa1, stringsAsFactors = F)) 
TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations <- unique(TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations)

TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations.zeros <- TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations[TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations$min.dist==0,]

TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations.zeros.combos <- TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations.zeros %>% dplyr::group_by(taxa1, taxa2) %>% dplyr::summarise(count=n())

nrow(TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations.zeros.combos)/2
TPA.alignment.data.dist.melt.meta.between.countries.mindist.combinations.zeros.combos

```



Get specific pairwise stats for UK and Canada
```{r}

TPA.alignment.data.dist.melt.meta.UK.Canada <- (TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$Country_combinations=="UK___Canada" | TPA.alignment.data.dist.melt.meta$Country_combinations=="Canada___UK"),c("Taxa1","Taxa2","Distance", "Country_combinations","year.distance")])

# Number of comparisons
nrow(TPA.alignment.data.dist.melt.meta.UK.Canada[TPA.alignment.data.dist.melt.meta.UK.Canada$Distance==0,])


Canada.UK.zero.comparison.samples <- as.character(unique(TPA.alignment.data.dist.melt.meta.UK.Canada[TPA.alignment.data.dist.melt.meta.UK.Canada$Distance==0,"Taxa1"]))
Canada.UK.zero.comparison.samples <- unique(c(Canada.UK.zero.comparison.samples,as.character(unique(TPA.alignment.data.dist.melt.meta.UK.Canada[TPA.alignment.data.dist.melt.meta.UK.Canada$Distance==0,"Taxa2"]))))

Canada.UK.zero.comparison.samples <- data.frame(Sample_Name=Canada.UK.zero.comparison.samples,stringsAsFactors = F)
Canada.UK.zero.comparison.samples <- plyr::join(Canada.UK.zero.comparison.samples,TPA.meta1.2, by="Sample_Name", type="left")

nrow(Canada.UK.zero.comparison.samples)
nrow(Canada.UK.zero.comparison.samples[Canada.UK.zero.comparison.samples$Geo_Country=="UK",])
nrow(Canada.UK.zero.comparison.samples[Canada.UK.zero.comparison.samples$Geo_Country=="Canada",])


# Look at temporal distance within zero-SNP comparisons between countries
sort(unique(TPA.alignment.data.dist.melt.meta.UK.Canada[TPA.alignment.data.dist.melt.meta.UK.Canada$Distance==0,"year.distance"]))

# Identify samples invovled in a 0 SNP comparison and are 14 years apart
unique(as.vector(as.matrix(TPA.alignment.data.dist.melt.meta.UK.Canada[(TPA.alignment.data.dist.melt.meta.UK.Canada$Distance==0 & TPA.alignment.data.dist.melt.meta.UK.Canada$year.distance==14),c("Taxa1","Taxa2")])))

TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Sample_Name %in% c("TPA_UKBIR049", "TPA_UKBIR030", "TPA_BCC030", "TPA_BCC032", "TPA_UKMAN027", "TPA_UKLEE004"),]

TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$Taxa1 %in% c("TPA_BCC030","TPA_BCC032") & TPA.alignment.data.dist.melt.meta$Taxa2 %in% c("TPA_BCC030","TPA_BCC032")),]
#"TPA_BCC030", "TPA_BCC032" 


min(Canada.UK.zero.comparison.samples[,"Sample_Year"])
max(Canada.UK.zero.comparison.samples[,"Sample_Year"])

TPA.alignment.data.dist.melt.meta
```

Make a plot looking at pairwise distances in UK and Canada
```{r}
TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta <- (TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$Country_combinations=="Canada___UK" | TPA.alignment.data.dist.melt.meta$Country_combinations=="Canada___Canada" |  TPA.alignment.data.dist.melt.meta$Country_combinations=="UK___UK"),])

TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta <- TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta$same.sample=="different",]

TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta$Country_combinations2 <- gsub("UK___UK","England",gsub("Canada___Canada","British Columbia",gsub("Canada___UK","British Columbia v.s. England",TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta$Country_combinations)))


# do distributions plots on PW SNPs for country comparisons
p.UK.canada.pwSNPs.sina <- ggplot(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta, aes(Country_combinations2, Distance)) +
  geom_sina(alpha=0.1,size=1, aes(color=Country_combinations2)) + 
  #geom_boxplot(alpha=0.01, outlier.shape = NA, width=0.25) +
  theme_light() +
  theme.text.size + 
  #scale_y_log10() +
  labs(x="Country Combination",y="Pairwise SNPs") +
  #scale_colour_manual(values=c("#74C476", "cyan3", "#084594")) +
  scale_colour_manual(values=c("#74C476", "grey50", "#084594")) + 
  theme(legend.position='none') +
  ggtitle("Pairwise SNPs within and between British Columbia (Canada) and England (UK)") +
  NULL
p.UK.canada.pwSNPs.sina

# Do distributions of pwSNPs v.s. timepoints
p.UK.canada.pwSNPs.vs.Time.points <- ggplot(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta, aes(year.distance,Distance)) +
  geom_point(size=1, alpha=0.1) +
  geom_density_2d() +
  theme_light() +
  facet_grid(.~Country_combinations2) + 
  scale_y_log10() +
  labs(y="Pairwise SNP distance (log10 scale)", x="Pairwise time distance (years)") +
  theme.text.size +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.x = element_text(color = "grey25", size=10))
#p.UK.canada.pwSNPs.vs.Time.points 

# Breakdown pairwise SNP/time distances by sublineage
TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta$TPA.pinecone.sublineage.t1 <- factor(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta$TPA.pinecone.sublineage.t1, levels=sublineages.cols.brew$sublineage)
p.UK.canada.pwSNPs.vs.Time.points.sublineage.breakdown <- ggplot(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta$same.TPA.Pinecone.cluster=="same",], aes(year.distance,Distance, color=TPA.pinecone.sublineage.t1)) +
  geom_point(size=2, alpha=0.25) +
  geom_density_2d(color="black", alpha=0.5) +
  theme_light() +
  facet_grid(TPA.pinecone.sublineage.t1~Country_combinations2) + 
  #scale_y_log10() +
  labs(y="Pairwise SNP distance (log10 scale)", x="Pairwise time distance (years)") +
  theme.text.size +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.x = element_text(color = "grey25", size=10),strip.text.y = element_text(color = "grey25", size=10)) +
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) + 
  NULL
p.UK.canada.pwSNPs.vs.Time.points.sublineage.breakdown






# do pwSNPs v.s. timepoints using a hexplot to reduce overplotting
p.UK.canada.pwSNPs.vs.Time.hex <- ggplot(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta, aes(year.distance,Distance)) +
  stat_bin_hex(colour="white", na.rm=TRUE, bins = 20) +
  scale_fill_gradientn(colours=c("purple","green"), 
                       name = "Comparison Frequency", breaks=c(1,30,1000),
                       na.value=NA, trans="log10") + 
  facet_grid(.~Country_combinations2) + 
  theme_light() +
  labs(y="Pairwise SNP distance", x="Pairwise time distance (years)") +
  theme.text.size + theme(legend.key.size = unit(0.75,"line")) +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.x = element_text(color = "grey25", size=10)) +
  theme(legend.position="bottom") +
  ggtitle("Pairwise SNPs and Years within and between British Columbia (Canada) and England (UK)") + theme(plot.title = element_text(size = 10))
#p.UK.canada.pwSNPs.vs.Time.hex




# do pwSNPs v.s. timepoints using a hexplot to reduce overplotting (by sublineage)
TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage <- TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta$same.TPA.Pinecone.cluster=="same",]

p.UK.canada.pwSNPs.vs.Time.hex.sublineage.breakdown <- ggplot(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage, aes(year.distance,Distance)) +
  stat_bin_hex(colour="white", na.rm=TRUE, bins = 15) +
  #geom_density_2d_filled() +
  #geom_density_2d_filled() + scale_fill_brewer(palette="PuRd") +
  scale_fill_gradientn(colours=c("purple","green"), 
                       name = "Comparison Frequency", breaks=c(1,30,1000),
                       na.value=NA, trans="log10") + 
  #geom_point(alpha=0.025, size=0.25) +
  #geom_density_2d(color="black", alpha=0.5, bins=4) +
  #facet_grid(TPA.pinecone.sublineage.t1~Country_combinations2) + 
  facet_grid(.~Country_combinations2) + 
  theme_light() +
  #scale_y_log10(breaks=c(0.01,1,5,10,20,40)) + scale_x_log10(breaks=c(0.01,1,5,10,20)) +
  #coord_cartesian(xlim=c(0.1,20), ylim=c(0.1,30)) +
  scale_y_continuous(breaks=seq(0,25,5)) +
  labs(y="Pairwise SNP distance", x="Pairwise time distance (years)") +
  theme.text.size + theme(legend.key.size = unit(0.75,"line")) +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.x=element_text(color="grey25", size=10), strip.text.y=element_text(color="grey25", size=10)) +
  theme(legend.position="bottom") +
  ggtitle("Pairwise SNPs (same sublineage) and Years within and between British Columbia (Canada) and England (UK)") + theme(plot.title = element_text(size = 10))


p.UK.canada.pwSNPs.vs.Time.hex.sublineage.breakdown <- p.UK.canada.pwSNPs.vs.Time.hex.sublineage.breakdown + stat_smooth(method='lm', fullrange=F,se=T, color='black', level=95)
#p.UK.canada.pwSNPs.vs.Time.hex.sublineage.breakdown + stat_smooth(fullrange=F,se=T, color='black', formula=log10(x) ~ log10(x))
#p.UK.canada.pwSNPs.vs.Time.hex.sublineage.breakdown


```
Look at temporal relationships a little more
```{r}
# time distances at 0 SNPs
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance==0,"year.distance"])
max(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance==0,"year.distance"])
min(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance==0,"year.distance"])

# mean SNP distance
max(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance)
# max SNP distance 
max(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance)

# time distances at 26 SNPs
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance==26,"year.distance"])
max(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance==26,"year.distance"])
min(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance==26,"year.distance"])


# time distances at 0 SNPs (Canada)
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance==0 & TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Country_combinations=="Canada___Canada"),"year.distance"])
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance==0 & TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Country_combinations=="UK___UK"),"year.distance"])
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance==0 & TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Country_combinations=="Canada___UK"),"year.distance"])


##############
# SNP distances at 0 time
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance==0,"Distance"])
max(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance==0,"Distance"])
min(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance==0,"Distance"])

max(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance)
# SNP distances at 19 years time
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance==19,"Distance"])
max(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance==19,"Distance"])
min(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance==19,"Distance"])


# SNP distances at 0 time (England)
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance==0 & TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Country_combinations=="UK___UK"),"Distance"])
# SNP distances at 0 time (Canada)
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance==0 & TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Country_combinations=="Canada___Canada"),"Distance"])
# SNP distances at 0 time (Both)
mean(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance==0 & TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Country_combinations=="Canada___UK"),"Distance"])


```



### Look at formally testing for signal

Calculate Pearson's correlation (for real dataset)
```{r}
#TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage[,c("Distance","year.distance")]

#TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage


temporal.real.correlation1 <- cor(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance,TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance)

#temporal.real.correlation1 <- cor.test(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance,TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance)

# Now compare to randomised bootstrap sampling (with replacement)
set.seed(12345)
bootstrap.count <- 1000
temporal.bootstrap.correlation1 <- NULL
for (bootstrap in 1:bootstrap.count){ 
  sample1 <- data.frame(Distance=sample(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$Distance,replace=T), year.distance=TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage$year.distance, stringsAsFactors = F)
  temporal.bootstrap.correlation1 <- c(temporal.bootstrap.correlation1, cor(sample1[,1], sample1[,2]))
}
temporal.bootstrap.correlation1 <- data.frame(Correlation=temporal.bootstrap.correlation1, type="Bootstrap",stringsAsFactors=F)
temporal.bootstrap.correlation1 <- rbind(data.frame(Correlation=temporal.real.correlation1, type="Real", stringsAsFactors=F),temporal.bootstrap.correlation1)


nrow(TPA.alignment.data.dist.melt.meta.UK.Canada.fullmeta.within.sublineage)

temporal.bootstrap.correlation1$pval <- (1+sum(temporal.bootstrap.correlation1[temporal.bootstrap.correlation1$type=="Bootstrap","Correlation"] >= temporal.bootstrap.correlation1[temporal.bootstrap.correlation1$type=="Real","Correlation"]))/(nrow(temporal.bootstrap.correlation1[temporal.bootstrap.correlation1$type=="Bootstrap",]) +1)
# Adjust to minimum sensitivity of method (in case of zero)
temporal.bootstrap.correlation1$pval <- ifelse(temporal.bootstrap.correlation1$pval==0, 1/bootstrap.count,temporal.bootstrap.correlation1$pval)

ggplot(temporal.bootstrap.correlation1, aes(type, Correlation)) + 
  geom_boxplot() + 
  theme_light() +
  #scale_y_log10() +
  geom_text(data=temporal.bootstrap.correlation1[temporal.bootstrap.correlation1$type=="Real",], aes(type, y=0.075, label=paste0("Correlation=",round(temporal.bootstrap.correlation1$Correlation[1],5))), inherit.aes = F) +
  labs(x="Dataset", y="Correlation") +
  geom_text(data=temporal.bootstrap.correlation1[temporal.bootstrap.correlation1$type=="Real",], aes(type, y=0.05, label=paste0("p<",round(temporal.bootstrap.correlation1$pval[1],5))), inherit.aes = F) + 
  ggtitle(paste0("Effect of temporal distance on pairwise genetic distance for BC/England\nsamples of the same sublineage"," (",bootstrap.count," bootstraps)")) + theme(plot.title = element_text(size = 10))
```



Combine pwSNP data with rest of Canada/UK plots
```{r, fig.width=12, fig.height=6, message=FALSE, warning=FALSE}

plot.UK.BC.stats.vs.bubbleplot <- plot_grid(plot.UK.BC.stats.combined + x.theme.strip + ggtitle("Syphilis Incidence and sublineage count") +theme(plot.title = element_text(size = 10)), plot.TPA.sublineage_UK.Canada.temporal.counts.bubbleplot + theme(legend.position="bottom",strip.background = element_blank(),strip.text.x = element_blank()), ncol=1, align=T, rel_heights=c(1,2), labels=c('A','B'), label_size=11)

#plot.UK.BC.stats.vs.bubbleplot_vs_pairwise.SNPs <- plot_grid(plot.UK.BC.stats.vs.bubbleplot, p.UK.canada.pwSNPs.vs.Time.hex, ncol=2, rel_widths=c(5,6), labels=c('','C'), label_size=11)

plot.UK.BC.stats.vs.bubbleplot_vs_pairwise.SNPs <- plot_grid(plot.UK.BC.stats.vs.bubbleplot, p.UK.canada.pwSNPs.sina, ncol=2, rel_widths=c(7,5), labels=c('','C'), label_size=11)



plot.UK.BC.stats.vs.bubbleplot_vs_pairwise.SNPs
```


Combine to make a BC vs UK only plot
```{r, fig.width=8, fig.height=11, message=FALSE, warning=FALSE}

#plot.UK.BC.stats.vs.bubbleplot
#p.UK.canada.pwSNPs.vs.Time.hex.sublineage.breakdown


#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure13_Canada-vs-UK_distros_03-2021.svg"), width = 600, height = 1050,type="svg",units = "pt")
plot_grid(plot.UK.BC.stats.vs.bubbleplot, p.UK.canada.pwSNPs.sina, p.UK.canada.pwSNPs.vs.Time.hex.sublineage.breakdown, ncol=1, labels=c('','C','D'), label_size=11, scale=0.95, rel_heights=c(4,2,3))
#dev.off()
```




Combine Canada/UK analysis with Global Pairwise analysis
```{r, fig.width=14, fig.height=12, message=FALSE, warning=FALSE}


plot_grid(plot.UK.BC.stats.vs.bubbleplot_vs_pairwise.SNPs, multicountry.pairwise.snps.grid, ncol=1, scale=0.95)

```






Do pairwise SNPs within sublineages
```{r}

#TPA.alignment.data.dist.melt.meta$Geo_Country.t1 <- factor(TPA.alignment.data.dist.melt.meta$Geo_Country.t1, levels=continental.country.cols.brew2$Geo_Country)

TPA.alignment.data.dist.melt.meta$TPA.pinecone.sublineage.t1 <- factor(TPA.alignment.data.dist.melt.meta$TPA.pinecone.sublineage.t1, levels=sublineages.cols.brew$sublineage)

#scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage)

p.pairwise.snps.within.Sublineage <- ggplot(TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.TPA.Pinecone.cluster=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different"),], aes(TPA.pinecone.sublineage.t1, Distance, color=TPA.pinecone.sublineage.t1)) +
  geom_violin(scale='width') +
  geom_sina(alpha=0.5, scale='width', method="d") + 
  theme_light() +
  theme.text.size + 
  #x.theme.axis.rotate +
  labs(x="Sublineage",y="Pairwise SNPs") +
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme(legend.key.size = unit(0.65,"line"),legend.position='none') + 
  ggtitle("Pairwise SNP distributions between samples from same sublineage") + 
  theme(plot.title = element_text(size = 10)) + 
  NULL


p.pairwise.snps.within.Sublineage

```

Just check the maximum pairwise SNPs within sublineages

```{r}
max(TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.TPA.Pinecone.cluster=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different"),"Distance"])

max(TPA.alignment.data.dist.melt.meta[(TPA.alignment.data.dist.melt.meta$same.TPA.Pinecone.cluster=="same" & TPA.alignment.data.dist.melt.meta$same.sample=="different" & TPA.alignment.data.dist.melt.meta$TPA.pinecone.sublineage.t1==19),"Distance"])

```




# networks of lineage sharing

Create a network of country links (simply based on cooccurrence of sublineages) - make a heatmap based on the linklist

```{r}

country.combinations <- data.frame(t(combn(data.frame(continental.country.cols.brew2)[continental.country.cols.brew2$Geo_Country!="Belgium","Geo_Country"],2, simplify=T)),stringsAsFactors = F)
colnames(country.combinations) <- c("taxa1","taxa2")
country.combinations <- rbind(country.combinations,data.frame(taxa1=country.combinations$taxa2, taxa2=country.combinations$taxa1, stringsAsFactors = F))
country.combinations$combo <- paste0(country.combinations$taxa1,"___",country.combinations$taxa2)


sublineage.country.summary.simple <- data.frame(TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage!="Singleton",] %>% dplyr::group_by(TPA.pinecone.sublineage,Geo_Country) %>% 
  dplyr::summarise(total.samples=n()), stringsAsFactors = F)
sublineage.country.summary.simple <- plyr::join(sublineage.country.summary.simple,unique(TPA.meta1.2.pinecone[,c("TPA.pinecone.sublineage","TPA.pinecone.major")]), by="TPA.pinecone.sublineage")


#sublineage.sharing.links.all <- sublineage.sharing.list
sublineage.sharing.links.all <- data.frame(sublineage.country.summary.simple %>% dplyr::group_by(TPA.pinecone.sublineage) %>% dplyr::summarise(no_links = length(TPA.pinecone.sublineage)),stringsAsFactors = F)
sublineage.sharing.links <- subset(sublineage.sharing.links.all, no_links>1)


# Make list of interaction combos (i.e. network edges)
linklist.all <- NULL
for (current in sublineage.sharing.links.all$TPA.pinecone.sublineage){
  current.sublineage <- subset(sublineage.country.summary.simple,TPA.pinecone.sublineage==current)
  if (nrow(current.sublineage)>1){
    current.sublineage1 <- data.frame(taxa1=t(combn(current.sublineage$Geo_Country, 2, FUN=NULL, simplify=T))[,1],taxa2=t(combn(current.sublineage$Geo_Country, 2, FUN=NULL, simplify=T))[,2],stringsAsFactors=T)
  current.sublineage1$sublineage <- current
  linklist.all <- rbind(linklist.all, current.sublineage1)
  }
}

linklist.all <- data.frame(linklist.all, stringsAsFactors=F)
linklist.all$combo <- paste0(linklist.all$taxa1,"___",linklist.all$taxa2)
linklist.all$combo2 <- paste0(linklist.all$taxa2,"___",linklist.all$taxa1)


linklist.all <- rbind(data.frame(taxa1=linklist.all$taxa1,taxa2=linklist.all$taxa2, sublineage=linklist.all$sublineage,combo=linklist.all$combo, stringsAsFactors =F), data.frame(taxa1=linklist.all$taxa2,taxa2=linklist.all$taxa1, sublineage=linklist.all$sublineage,combo=linklist.all$combo2, stringsAsFactors =F))


linklist.all.frequency <- linklist.all %>% dplyr::group_by(combo, .drop=F) %>%
  dplyr::summarise(Sublineage.Count=n())
linklist.all <- plyr::join(linklist.all,linklist.all.frequency, by="combo", type='left')

linklist.all2 <- plyr::join(country.combinations[,c("taxa1","taxa2","combo")],linklist.all[,c("taxa1","taxa2","combo","Sublineage.Count")], type="left")
linklist.all2 <- linklist.all2[rev(order(linklist.all2$combo, linklist.all2$Sublineage.Count)),]
linklist.all2 <-linklist.all2[!duplicated(linklist.all2),]
linklist.all <- linklist.all2


linklist.all[is.na(linklist.all$Sublineage.Count),"Sublineage.Count"] <- 0


linklist.all$taxa1 <- factor(linklist.all$taxa1, levels=continental.country.cols.brew2$Geo_Country)
linklist.all$taxa2 <- factor(linklist.all$taxa2, levels=continental.country.cols.brew2$Geo_Country)

p.country.sublineage.lings.heatmap <- ggplot(linklist.all, aes(taxa1, taxa2, fill=Sublineage.Count)) +
  geom_tile(color="white") +
  scale_fill_gradient(low="#ffffcc",high="#2c7fb8", name="Shared\nsublineages") + 
  #theme_classic() + 
  theme_light() + 
  theme.text.size + 
  theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5), axis.title.x = element_blank(),
        axis.title.y = element_blank()) +
  geom_text(aes(label = Sublineage.Count), color = "black", size = 2.5) +
  theme(axis.text.x = element_text(colour=data.frame(continental.country.cols.brew2)[continental.country.cols.brew2$Geo_Country!="Belgium","country.col"]), axis.text.y = element_text(colour=data.frame(continental.country.cols.brew2)[continental.country.cols.brew2$Geo_Country!="Belgium","country.col"])) +
  theme(legend.key.size = unit(0.75,"line")) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  ggtitle("Number of sublineages shared between countries") + theme(plot.title = element_text(size = 10))
  
p.country.sublineage.lings.heatmap

```

Plot lineage sharing network with classified lineage counts
```{r, fig.width=8, fig.height=7, message=FALSE, warning=FALSE}

p.shared.sublineage.legend <- plot_grid(get_legend(plot.classified.sublineages.per.country + theme(legend.position='right')), get_legend(p.country.sublineage.lings.heatmap + theme(legend.position='right')), ncol=1, rel_heights=c(2,3))

p.shared.sublineage.plotgrid <-  plot_grid(plot.classified.sublineages.per.country + theme(legend.position="none") + ggtitle("Number of sublineages in each country") + theme(plot.title = element_text(size=10)) + x.theme.strip, p.country.sublineage.lings.heatmap + theme(legend.position="none"), align=T, ncol=1, rel_heights=c(2,3),labels=c('A','B'), label_size=11)

plot_grid(p.shared.sublineage.plotgrid,p.shared.sublineage.legend, ncol=2, rel_widths=c(4,1))


```



Try plot a different way (02-2021) to focus on global analysis alone
```{r, fig.width=10, fig.height=10, message=FALSE, warning=FALSE}

# Sort labels
plot.pairwise.SNPs.combi.5 <- plot_grid(plot.classified.sublineages.per.country + x.theme.strip + theme(legend.position="none") + labs(y="Sublineage Count"), p.hq.country.barplot + x.theme.strip + ggtitle('Sample count by country') + theme(plot.title=element_text(size=10)), 
                                        p.country.minsnp.heatmap + theme(legend.position="none") + ggtitle('Minimum pairwise SNP distance between countries') + theme(plot.title=element_text(size=10)), ncol=1, rel_heights=c(1,1,4), align="v", axis="lr", labels=c('A','B','D'), label_size=11)

plot.pairwise.SNPs.combi.5 <- plot_grid(plot.pairwise.SNPs.combi.5, p.pairwise.snps.withinCountry.within.all.Lineage + theme(legend.position='none'), ncol=2, labels=c('','C'), label_size=11, scale=0.95)



# Sort labels
plot.pairwise.SNPs.combi.6 <- plot_grid(plot.classified.sublineages.per.country + x.theme.strip + theme(legend.position="none") + labs(y="Sublineage Count"), p.hq.country.barplot + ggtitle('Sample count by country') + theme(plot.title=element_text(size=10)), 
                                        p.country.minsnp.heatmap + theme(legend.position="none") + ggtitle('Minimum pairwise SNP distance between countries') + theme(plot.title=element_text(size=10)), ncol=1, rel_heights=c(1,2,4), align="v", axis="lr", labels=c('A','B','D'), label_size=11)

plot.pairwise.SNPs.combi.6 <- plot_grid(plot.pairwise.SNPs.combi.6, p.pairwise.snps.withinCountry.within.all.Lineage + theme(legend.position='none'), ncol=2, labels=c('','C'), label_size=11, scale=0.95)




pwSNPs.legend <- get_legend(p.pairwise.snps.withinCountry.within.all.Lineage + theme(legend.position='top'))
pwSNPs.heatmap.legend <- get_legend(p.country.minsnp.heatmap + theme(legend.position='top'))
sublineages.legend <- get_legend(plot.classified.sublineages.per.country + theme(legend.position='top') + guides(fill=guide_legend(nrow=3)))
countries.bar.legend <- get_legend(p.hq.country.barplot + theme(legend.position='top') + guides(fill=guide_legend(nrow=6)))

Figure3.combi.legend <- plot_grid(sublineages.legend,pwSNPs.heatmap.legend,countries.bar.legend, nrow=1, rel_widths=c(2,1,3))

pwSNPs.legend.vert <- get_legend(p.pairwise.snps.withinCountry.within.all.Lineage + theme(legend.position='left'))
pwSNPs.heatmap.legend.vert <- get_legend(p.country.minsnp.heatmap + theme(legend.position='left'))
sublineages.legend.vert <- get_legend(plot.classified.sublineages.per.country + theme(legend.position='left') + guides(fill=guide_legend(ncol=1)))
countries.bar.legend.vert <- get_legend(p.hq.country.barplot + theme(legend.position='left') + guides(fill=guide_legend(ncol=1)))

Figure3.combi.legend.vert <- plot_grid(sublineages.legend.vert,pwSNPs.heatmap.legend.vert,countries.bar.legend.vert, ncol=1, rel_heights=c(2,1,3))



####
# Version 7 of this complex and much commented on plot ;-) 

Figure3.combi.legend.mixit1 <- plot_grid(sublineages.legend,countries.bar.legend, nrow=2, rel_heights=c(2,3))

# Sort Figure and labels
plot.pairwise.SNPs.combi.7 <- plot_grid(plot.classified.sublineages.per.country + x.theme.strip + theme(legend.position="none") + labs(y="Sublineage Count"), p.hq.country.barplot + x.theme.strip + ggtitle('Sample count by country') + theme(plot.title=element_text(size=10)), 
                                        p.country.minsnp.heatmap + theme(legend.position="none") + ggtitle('Minimum pairwise SNP distance between countries') + theme(plot.title=element_text(size=10)), ncol=1, rel_heights=c(1,1,4), align="v", axis="lr", labels=c('A','B','D'), label_size=11)
plot.pairwise.SNPs.combi.7.a <- plot_grid(plot.pairwise.SNPs.combi.7, pwSNPs.heatmap.legend, rel_heights=c(11,1),ncol=1)

plot.pairwise.SNPs.combi.7.b <- plot_grid(Figure3.combi.legend.mixit1,p.pairwise.snps.withinCountry.within.all.Lineage + theme(legend.position='none'),ncol=1, rel_heights=c(1,4), labels=c('','C'), label_size=11, scale=0.95)  

plot.pairwise.SNPs.combi.7.c <- plot_grid(plot.pairwise.SNPs.combi.7.a, plot.pairwise.SNPs.combi.7.b, ncol=2)






#Cairo::Cairo(file=paste0(Figure_output_directory, "Figure4_Sublin+PairwiseSNPs__Global-distro_02-2021.svg"), width = 800, height = 800,type="svg",units = "pt")
plot.pairwise.SNPs.combi.7.c
#dev.off()
```





\
# need to do a tree that only highlights singleton or private sublineages\

```{r, fig.width=10, fig.height=10, message=FALSE, warning=FALSE}

private.singleton.lineages <- data.frame(Private.country.counts[Private.country.counts$private.distro=="private",c("TPA.pinecone.sublineage","private.distro")])

private.singleton.lineages <- rbind(private.singleton.lineages,data.frame(TPA.pinecone.sublineage="Singleton",private.distro="Singleton",stringsAsFactors = F))
private.singleton.lineages$private.sublineages <- private.singleton.lineages$TPA.pinecone.sublineage
private.singleton.lineages <- private.singleton.lineages[,c("TPA.pinecone.sublineage","private.sublineages")]

private.singleton.samples <-  plyr::join(TPA.meta1.2.pinecone[,c("Sample_Name","TPA.pinecone.sublineage")],private.singleton.lineages, type="left", by="TPA.pinecone.sublineage")

private.singleton.samples <- data.frame(row.names=private.singleton.samples$Sample_Name, "Private or Singleton\nSublineage"=private.singleton.samples$private.sublineages)


# add private lineage strip to tree 
p.TPA.MLtree.sublineages.privatelineage <- gheatmap(TPA.MLtree.ggtree.tippoint + theme(legend.position="none"),
               private.singleton.samples, color='grey70',width=0.075,offset=0.00000725, colnames_angle=-45,colnames_offset_y=0, hjust=0,font.size=2) + 
  scale_fill_manual(name="Private & Singleton\nSublineages",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right')
p.TPA.MLtree.sublineages.privatelineage <- p.TPA.MLtree.sublineages.privatelineage + new_scale_fill()


p.TPA.MLtree.sublineages.privatelineage <-gheatmap(p.TPA.MLtree.sublineages.privatelineage,TPA.rawseq.countries.p, color='grey70',width=0.075,offset=0.00001725, colnames_angle=-45,colnames_offset_y=0, hjust=0,font.size=2) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right') +
  NULL



#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure11__MLtree_private+singleton_02-2021.svg"), width = 800, height = 800,type="svg",units = "pt")
p.TPA.MLtree.sublineages.privatelineage
#dev.off()


```




Need to make a subtree highlighting the Reference strains from Nichols lineage
```{r}
#Nichols.coll + geom_text2(aes(subset=!isTip, label=node), hjust=-.3, size=2.5) 
#ggtree(Nichols.ref.subtree.nodeid.tree) + geom_text2(aes(subset=!isTip, label=node), hjust=-.3) 

#Nichols.reference.clade.Year.data <- data.frame(row.names=TPA.meta1.2.pinecone.havedates$Sample_Name, TPA.meta1.2.pinecone.havedates$Sample_Year,stringsAsFactors=F)


Nichols.ref.subtree.nodeid <- 976
#Nichols.ref.subtree.nodeid <- 979

Nichols.ref.subtree.nodeid.tree <- tree_subset(TPA.MLtree, node=Nichols.ref.subtree.nodeid,levels_back=0)
#ggtree(Nichols.ref.subtree.nodeid.tree) + geom_tiplab(size=2.5) 



p.Nichols.ref.subtree.nodeid.tree <- ggtree(Nichols.ref.subtree.nodeid.tree) %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage) + 
  geom_tippoint(aes(color=Sublineage), size=2.5, alpha=0.25,show.legend=F) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  # add bootstrap support
  geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=2, shape=18) +
  geom_tiplab(size=2.5,align=F) + 
  geom_treescale(fontsize = 2.5, x=0.00001, y=12) +
  #xlim(0, 0.00007) +
  NULL

p.Nichols.ref.subtree.nodeid.tree.hm <- gheatmap(p.Nichols.ref.subtree.nodeid.tree,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage), color=NULL,width=0.065,offset=0.0000095, colnames_angle=0,colnames_offset_y=-0.01, font.size=2) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.75,"line"),legend.position='right') +
  geom_rootedge(0.00000075) 

p.Nichols.ref.subtree.nodeid.tree.hm <- p.Nichols.ref.subtree.nodeid.tree.hm + new_scale_fill()


p.Nichols.ref.subtree.nodeid.tree.hm <- gheatmap(p.Nichols.ref.subtree.nodeid.tree.hm, data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Year=TPA.meta1.2.pinecone$Sample_5year.window, stringsAsFactors = F), color=NULL,width=0.065,offset=0.0000135, colnames_angle=0,colnames_offset_y=-0.01, font.size=2) + scale_fill_manual(name="Year",values=TPA.5year.window.brewcols$window.5year.cols[1:(length(TPA.5year.window.brewcols$window.5year.cols)-1)], breaks=TPA.5year.window.brewcols$window.5year[1:(length(TPA.5year.window.brewcols$window.5year)-1)]) #+
  #theme.text.size + theme(legend.key.size = unit(0.75,"line"),legend.position='left')


p.Nichols.ref.subtree.nodeid.tree.hm <- p.Nichols.ref.subtree.nodeid.tree.hm + new_scale_fill()
p.Nichols.ref.subtree.nodeid.tree.hm


```


Plot main Nichols tree, but with highlight for relevant clade
```{r}
p.TPA.Nichols.coll.highlight <- p.TPA.Nichols.coll + new_scale_fill()
p.TPA.Nichols.coll.highlight <- p.TPA.Nichols.coll.highlight  + geom_hilight(node=Nichols.ref.subtree.nodeid, alpha=0.2, fill="grey45")
#p.TPA.Nichols.coll.highlight

# Add sample year (group)
p.TPA.Nichols.coll.highlight <- gheatmap(p.TPA.Nichols.coll.highlight,data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Year=TPA.meta1.2.pinecone$Sample_5year.window, stringsAsFactors = F), color=NULL,width=0.085,offset=0.00002025, colnames_angle=0,colnames_offset_y=-1, font.size=2) + scale_fill_manual(name="Year",values=TPA.5year.window.brewcols$window.5year.cols[1:(length(TPA.5year.window.brewcols$window.5year.cols)-1)], breaks=TPA.5year.window.brewcols$window.5year[1:(length(TPA.5year.window.brewcols$window.5year)-1)]) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='left')

p.TPA.Nichols.coll.highlight <- p.TPA.Nichols.coll.highlight + new_scale_fill()
```



plot Nichols trees together
```{r, fig.width=12, fig.height=7, message=FALSE, warning=FALSE}

p.Nichols.ref.subtree.nodeid.tree.hm.grid <- plot_grid(NULL,p.Nichols.ref.subtree.nodeid.tree.hm + theme(legend.position="none") + ggtitle("Subtree") + theme(plot.title = element_text(size = 10)), NULL, rel_heights=c(1,4,1),ncol=1, labels=c('','B',''), label_size=11, vjust=0)

p.Nichols.ref.subtree.nodeid.tree.hm.grid.final <- plot_grid(p.TPA.Nichols.coll.highlight + theme(legend.position="left") + ggtitle("Nichols-lineage phylogeny") + theme(plot.title = element_text(size = 10)),NULL,p.Nichols.ref.subtree.nodeid.tree.hm.grid, ncol=3, rel_widths=c(10,1,10), labels=c('A','',''), vjust=1, label_size=11)


#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure6_MLtree_Nichols-reference-highlight__02-2021.svg"), width = 800, height = 500,type="svg",units = "pt")
p.Nichols.ref.subtree.nodeid.tree.hm.grid.final
#dev.off()

```

subset Nichols to show outgroups better
```{r, fig.width=12, fig.height=8, message=FALSE, warning=FALSE}

# sublineage 19 
sublineages.tocollapse.nodeid.19 <- 997
# Sublineage 14 (formerly called 19 in older analysis before bootstrapping)
sublineages.tocollapse.nodeid.14 <- 997

# sublineage 12 
sublineages.tocollapse.nodeid.12 <- 963



# Collapse SS14 clade and largest Nichols sublineage to make for easier viewing
Nichols.coll.2clades <- ggtree(TPA.MLtree) %>% collapse(node=SS14.subtree.nodeid) %>% 
  collapse(node=sublineages.tocollapse.nodeid.19) #%>%
  #collapse(node=sublineages.tocollapse.nodeid.12) 

# Add some extra to y axis for spacing
Nichols.coll.2clades$data[Nichols.coll.2clades$data$node==SS14.subtree.nodeid,"y"] <- Nichols.coll.2clades$data[Nichols.coll.2clades$data$node==SS14.subtree.nodeid,"y"] + 8

Nichols.coll.2clades$data[Nichols.coll.2clades$data$node==sublineages.tocollapse.nodeid.14,"y"] <- Nichols.coll.2clades$data[Nichols.coll.2clades$data$node==sublineages.tocollapse.nodeid.14,"y"] + 3

#Nichols.coll.2clades$data[Nichols.coll.2clades$data$node==sublineages.tocollapse.nodeid.12,"y"] <- Nichols.coll.2clades$data[Nichols.coll.2clades$data$node==sublineages.tocollapse.nodeid.12,"y"] + 5
#sublineages.tocollapse.nodeid.12


# Add first triangle
Nichols.coll.2clades <- Nichols.coll.2clades + geom_text2(aes(subset=(node == SS14.subtree.nodeid)), size=20, label=intToUtf8(9664), hjust=0.2,vjust=.55, family="OpenSansEmoji", color="indianred1", alpha=.75)
Nichols.coll.2clades <- Nichols.coll.2clades  + geom_text2(aes(subset=(node == SS14.subtree.nodeid)), cex=2.5, vjust=0.2, label="SS14",hjust = -1.5)


# Add second triangle
#Nichols.coll.2clades <- Nichols.coll.2clades + geom_text2(aes(subset=(node == sublineages.tocollapse.nodeid.19)), size=20, label=intToUtf8(9664), hjust=0.2,vjust=.55, family="OpenSansEmoji", color=sublineages.cols.brew[sublineages.cols.brew$sublineage==19,"sublineage.cols"], alpha=.75)
#Nichols.coll.2clades <- Nichols.coll.2clades  + geom_text2(aes(subset=(node == sublineages.tocollapse.nodeid.19)), cex=2.5, vjust=0.2, label="Sublineage 19",hjust = -0.5)

Nichols.coll.2clades <- Nichols.coll.2clades + geom_text2(aes(subset=(node == sublineages.tocollapse.nodeid.14)), size=20, label=intToUtf8(9664), hjust=0.2,vjust=.55, family="OpenSansEmoji", color=sublineages.cols.brew[sublineages.cols.brew$sublineage==14,"sublineage.cols"], alpha=.75)
Nichols.coll.2clades <- Nichols.coll.2clades  + geom_text2(aes(subset=(node == sublineages.tocollapse.nodeid.14)), cex=2.5, vjust=0.2, label="Sublineage 14",hjust = -0.5)



p.Nichols.coll.2clades <- Nichols.coll.2clades %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage) +
  geom_tippoint(aes(color=Sublineage), size=2.5, alpha=0.25,show.legend=F) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  # add bootstrap support
  geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=2, shape=18) +
  geom_tiplab(size=2.5,align=F) + 
  geom_treescale(fontsize = 2.5, x=0.00001, y=25) + 
  geom_tiplab(size=2.5) + 
  geom_hilight(node=1055, alpha=0.25, fill=sublineages.cols.brew[sublineages.cols.brew$sublineage==6,"sublineage.cols"]) +
  geom_hilight(node=957, alpha=0.25, fill=sublineages.cols.brew[sublineages.cols.brew$sublineage==7,"sublineage.cols"])

p.Nichols.coll.2clades <- p.Nichols.coll.2clades + new_scale_fill()

p.Nichols.coll.2clades.hm <- gheatmap(p.Nichols.coll.2clades,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage), color=NULL,width=0.06,offset=0.0000095, colnames_angle=0,colnames_offset_y=-0.01, font.size=2.25) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right')

p.Nichols.coll.2clades.hm <- p.Nichols.coll.2clades.hm + new_scale_fill()

p.Nichols.coll.2clades.hm <- gheatmap(p.Nichols.coll.2clades.hm,TPA.rawseq.countries.p, color=NULL,width=0.06,offset=0.0000165, colnames_angle=0,colnames_offset_y=-0.01, font.size=2.25) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='bottom') + 
  #geom_treescale(fontsize = 2.5, x=0.000001, y=35) +
  NULL
p.Nichols.coll.2clades.hm <- p.Nichols.coll.2clades.hm + new_scale_fill()


p.Nichols.coll.2clades.hm <- gheatmap(p.Nichols.coll.2clades.hm, data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Year=TPA.meta1.2.pinecone$Sample_5year.window, stringsAsFactors = F), color=NULL,width=0.06,offset=0.0000235, colnames_angle=0,colnames_offset_y=-0.01, font.size=2.25) + scale_fill_manual(name="Year",values=TPA.5year.window.brewcols$window.5year.cols[1:(length(TPA.5year.window.brewcols$window.5year.cols)-1)], breaks=TPA.5year.window.brewcols$window.5year[1:(length(TPA.5year.window.brewcols$window.5year)-1)]) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='left') + 
  ylim(-1,57)

p.TPA.Nichols.coll.highlight <- p.TPA.Nichols.coll.highlight + new_scale_fill()



#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure5_MLtree_highlight-outgroups__02-2021.svg"), width = 700, height = 500,type="svg",units = "pt")
p.Nichols.coll.2clades.hm + ggtitle("Nichols-lineage phylogeny with collapsed nodes") + theme(plot.title = element_text(size = 10))
#dev.off()
```



Subset SS14 tree to show outgroups
```{r, fig.width=8, fig.height=8, message=FALSE, warning=FALSE}

#SS14.coll + geom_text2(aes(subset=!isTip, label=node), hjust=-.3, size=2.5)

#ss14.sublin2.collapse.node <- 542
ss14.sublin1.collapse.node <- 534 #535

SS14.subclades1 <- ggtree(TPA.MLtree) %>% collapse(node=Nichols.subtree.nodeid) %>% 
  collapse(node=ss14.sublin1.collapse.node)
# Add some extra to y axis for spacing
SS14.subclades1$data[SS14.subclades1$data$node==Nichols.subtree.nodeid,"y"] <- SS14.subclades1$data[SS14.subclades1$data$node==Nichols.subtree.nodeid,"y"] -5

SS14.subclades1$data[SS14.subclades1$data$node==ss14.sublin1.collapse.node,"y"] <- SS14.subclades1$data[SS14.subclades1$data$node==ss14.sublin1.collapse.node,"y"] + 2

# Add first triangle
SS14.subclades1 <- SS14.subclades1 + geom_text2(aes(subset=(node == Nichols.subtree.nodeid)), size=20, label=intToUtf8(9664), hjust=0.2,vjust=.55, family="OpenSansEmoji", color="royalblue2", alpha=.75)
SS14.subclades1 <- SS14.subclades1 + geom_text2(aes(subset=(node == Nichols.subtree.nodeid)), cex=2.5, vjust=0.2, label="Nichols",hjust = -1.25)
# Add second triangle
SS14.subclades1 <- SS14.subclades1 + geom_text2(aes(subset=(node == ss14.sublin1.collapse.node)), size=20, label=intToUtf8(9664), hjust=0.2,vjust=.55, family="OpenSansEmoji", color=sublineages.cols.brew[sublineages.cols.brew$sublineage==1,"sublineage.cols"], alpha=.75)
SS14.subclades1 <- SS14.subclades1 + geom_text2(aes(subset=(node == ss14.sublin1.collapse.node)), cex=2, vjust=0.2, label="Sublineage 1",hjust = -0.75)


# add tippoint colours (sublineage)
p.SS14.subclades1 <- SS14.subclades1 %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage) +
  geom_tippoint(aes(color=Sublineage), size=1.5, alpha=0.25,show.legend=F) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  # add bootstrap support
  geom_point2(aes(subset=(!isTip & as.numeric(label)>95)),size=2, shape=18) +
  geom_treescale(fontsize = 2.5, x=0.00001, y=25)  
p.SS14.subclades1 <- p.SS14.subclades1 + new_scale_fill()

# add heatmap strips (sublineage)
p.SS14.subclades1.hm <- gheatmap(p.SS14.subclades1,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage), color=NULL,width=0.06,offset=0.0000095, colnames_angle=0,colnames_offset_y=-0.01, font.size=2.25) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right')
p.SS14.subclades1.hm <- p.SS14.subclades1.hm + new_scale_fill()

# add heatmap strips (country)
p.SS14.subclades1.hm <- gheatmap(p.SS14.subclades1.hm,TPA.rawseq.countries.p, color=NULL,width=0.06,offset=0.0000155, colnames_angle=0,colnames_offset_y=-0.01, font.size=2.25) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='bottom') +
  geom_tiplab(size=2) +
  NULL
p.SS14.subclades1.hm <- p.SS14.subclades1.hm + new_scale_fill()

# Add year group
p.SS14.subclades1.hm <- gheatmap(p.SS14.subclades1.hm,data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Year=TPA.meta1.2.pinecone$Sample_5year.window, stringsAsFactors = F), color=NULL,width=0.065,offset=0.0000215, colnames_angle=0,colnames_offset_y=-0.01, font.size=2.25) + scale_fill_manual(name="Year",values=TPA.5year.window.brewcols$window.5year.cols[1:(length(TPA.5year.window.brewcols$window.5year.cols)-1)], breaks=TPA.5year.window.brewcols$window.5year[1:(length(TPA.5year.window.brewcols$window.5year)-1)]) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='left') +
  ylim(-6,67) #ylim(-6,115)  
p.SS14.subclades1.hm <- p.SS14.subclades1.hm + new_scale_fill()


#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure4_MLtree_highlight-SS14__02-2021.svg"), width = 700, height = 800,type="svg",units = "pt")
p.SS14.subclades1.hm + ggtitle("SS14-lineage phylogeny") + theme(plot.title=element_text(size = 10))
#dev.off()

```


\
\
\


### BEAST analysis \
\
\
Subsampling representative tree for BEAST analyis

```{r}

Country.list2 <- data.frame(continental.country.cols.brew2,stringsAsFactors = F)[continental.country.cols.brew2$Geo_Country!="Belgium","Geo_Country"]
sublineage.lineage.list2 <- sublineages.cols.brew[sublineages.cols.brew$sublineage!="Singleton","sublineage"]
#Full.samplelist2 <- TPA.meta1.2.pinecone$Sample_Name
Full.samplelist2 <- TPA.meta1.2.pinecone[((TPA.meta1.2.pinecone$Sample_Year!="-") & !grepl("Nichols",TPA.meta1.2.pinecone$Sample_Name) & !grepl("-",TPA.meta1.2.pinecone$Sample_Year)),"Sample_Name"]
TPA.meta1.2.pinecone.havedates <- TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Sample_Name %in% Full.samplelist2,]


mysamplesize <- 5
total.bootstraps <- 1

all.bootstraps.lists <- NULL
all.bootstraps.trees <- c(rtree(20)) # create with random start tree to force into a multiphylo object
current.bootstrap <- 0
repeat {
  country.sample <- NULL
  for (current.lineage in sublineage.lineage.list2){
    for (current.country in Country.list2){
      current.list <- TPA.meta1.2.pinecone.havedates[(TPA.meta1.2.pinecone.havedates$TPA.pinecone.sublineage==current.lineage & TPA.meta1.2.pinecone.havedates$Geo_Country==current.country),"Sample_Name"]
      current.list <- current.list[!is.na(current.list)]
      if (length(current.list)>1){
        current.sample <- unique(sample(current.list,size=mysamplesize, replace=T))
        country.sample <- c(as.vector(current.sample), country.sample)
      }
    }
  }
  country.sample <- c(country.sample, as.vector(TPA.meta1.2.pinecone.havedates[(TPA.meta1.2.pinecone.havedates$TPA.pinecone.sublineage=="Singleton"),"Sample_Name"]))
  
  current.bootstrap <- current.bootstrap + 1
  current.sample.tree <- (ape::keep.tip(TPA.MLtree, country.sample))
  all.bootstraps.trees <- c(all.bootstraps.trees,c(current.sample.tree),recursive=T)
  all.bootstraps.lists <- c(all.bootstraps.lists, list(country.sample))
  if (current.bootstrap == total.bootstraps){
    break
  }
}
all.bootstraps.trees <- all.bootstraps.trees[c(2:(total.bootstraps+1))] # remove random start tree

#all.bootstraps.trees[[1]]
#subsampled.ML.tips.6pCountry.pSublin <- all.bootstraps.trees[[1]]$tip.label

# Outputs from a previous random loop used below to ensure sample remains the same
subsampled.ML.tips.6pCountry.pSublin <- c("Mexico_A-mcf", "TPA_RUS_Tuva-62", "TPA_RUS_Tuva-58", "TPA_RUS_Tuva-59", "TPA_RUS_Tuva-61", "PHE140073A", "UW116B", "UW186B", "UW213B", "PHE150137A", "PHE150129A", "TPA_UKBRG017", "TPA_BCC161", "TPA_OMI006", "TPA_ALC105", "UW187B", "PHE150177A", "TPA_BCC085", "K3", "SHE_V", "C3", "Q3", "TPA_BCC075", "TPA_USL-BAL-2", "SS14_v2", "UW824B", "PHE140084A", "TPA_HUN200024", "TPA_HUN190022", "TPA_UKBRG004", "TPA_HUN190008", "UW099B", "TPA_USL-SEA-81-8", "TPA_ZIM025", "TPA_ZIM005", "TPA_ZIM009", "TPA_ZIM007", "UW262B", "UW391B", "TPA_BCC139", "TPA_BCC137", "TPA_UKBRG015", "TPA_UKBRG018", "PHE130041A", "TPA_BCC130", "UW376B", "PHE150159A", "UW244B", "UW291B", "TPA_BCC125", "PT_SIF1002", "PT_SIF1196", "PT_SIF0857", "PHE160254A", "PT_SIF1183", "PHE160249A", "PHE170379A", "PT_SIF1020", "PT_SIF1063", "PHE160315A", "PHE170398A", "PHE170380A", "PHE170365A", "UW148B", "UW473B", "UW492B", "UW248B", "TPA_HUN190023", "UW138B", "UW368B", "UW149B", "UW104B", "SMUTp_02", "SMUTp_01", "SMUTp_08", "PT_SIF0954", "PT_SIF1200", "TPA_BCC128", "TPA_BCC127", "TPA_AUSBR-45", "TPA_HUN180007", "PHE160246A", "UW327B", "CW87", "TPA_ESBCN002", "TPA_SWE-467", "TPA_EIR017", "TPA_HUN190017", "PHE160248A", "PHE130053A", "TPA_HUN180001", "TPA_SWE-662", "PHE130051A", "TPA_BCC138", "TPA_AUSBR-113", "TPA_SWE-1352", "PT_SIF0877_3", "PT_SIF1142", "TPA_EIR013", "AU15", "TPA_EIR008", "AU16", "TPA_BCC049", "CW84", "TPA_BCC052", "PHE170392A", "Seattle_81-4", "TPA_USL-SEA-83-2", "UW279B", "TPA_USL-SEA-86-1", "PHE170336B", "PHE150114A", "TPA_OMI021", "CW59", "CW82", "TPA_USL-Phil-3", "BAL3", "BAL73", "TPA_ZIM014", "TPA_ZIM018", "TPA_ZIM015", "PHE150166A", "PHE160306A", "PHE150168A", "TPA_BCC122", "PHE160294A", "TPA_HUN180004", "TPA_BCC136", "TPA_HUN190020", "PHE160287A", "TPA_BCC126", "TPA_BCC169", "TPA_BCC012", "PHE120029A", "PHE120033A", "TPA_USL-Haiti-B", "PHE160283A", "PHE130048A")



subsampled.ML.tips.6pCountry.pSublin.tree <- (ape::keep.tip(TPA.MLtree, subsampled.ML.tips.6pCountry.pSublin))

subsampled.ML.tips.6pCountry.pSublin.tree.data <- data.frame(fortify(subsampled.ML.tips.6pCountry.pSublin.tree),stringsAsFactors = F)
subsampled.ML.tips.6pCountry.pSublin.tree.data$Sample_Name <- subsampled.ML.tips.6pCountry.pSublin.tree.data$label 
subsampled.ML.tips.6pCountry.pSublin.tree.data <- plyr::join(subsampled.ML.tips.6pCountry.pSublin.tree.data, TPA.meta1.2.pinecone.havedates[,c("Sample_Name","TPA.pinecone.major","TPA.pinecone.sublineage", "Sample_Year")], by="Sample_Name", type="left")




plot.subsampled.ML.tips.6pCountry.pSublin.tree <- ggtree(subsampled.ML.tips.6pCountry.pSublin.tree) %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage) + 
  geom_tippoint(aes(color=Sublineage), size=1, alpha=0.5) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) 

plot.subsampled.ML.tips.6pCountry.pSublin.tree <- gheatmap(plot.subsampled.ML.tips.6pCountry.pSublin.tree,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage), color=NULL,width=0.04, colnames_angle=0,colnames_offset_y=-1, font.size=2.5) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.5,"line"))

```

plot subsampled ML subtree
```{r}
plot.subsampled.ML.tips.6pCountry.pSublin.tree
```

Define function to extract and plot root-2-tip data from a tree or subtree
```{r}
# Inputs
# - A maximum likelihood tree in phylo
# - a dataframe with the headers c("Sample_Name","Sample_Year")

plotRootToTip <- function(input.ml.tree, input.dates.df){
  tree.data <- data.frame(ggtree::fortify(input.ml.tree),stringsAsFactors = F)
  tree.data$Sample_Name <- tree.data$label
  tree.data <- plyr::join(tree.data, input.dates.df, by="Sample_Name", type="left")
  RootTotipDistances <- tree.data[tree.data$isTip==TRUE,"x"]
  treeLabels <- tree.data[tree.data$isTip==TRUE,"Sample_Name"]
  ntips <- length(treeLabels)
  treeDates <- as.numeric(tree.data[tree.data$isTip==TRUE,"Sample_Year"])
  maxdate <- max(treeDates)
  mindate <- min(treeDates)
  treeModel <- lm(RootTotipDistances ~ treeDates)
  treeCorrelation <- cor.test(treeDates, RootTotipDistances, method = "pearson", conf.level = 0.95)
  modelSummary <- summary(treeModel)
  xIntercept <- -coef(treeModel)[1]/coef(treeModel)[2]
  RootTotipDF <- data.frame(RootTotipDistances,treeDates)
  
  plot.tree.data.root2tip <- ggplot(data = RootTotipDF, aes(treeDates,  RootTotipDistances)) +
    geom_point(alpha=0.25,size=2, colour = "red") +
    theme_classic() +
    labs(x="Year",y="Root to tip distance") +
    #geom_smooth(method='lm',fullrange=T,se=T) +
    stat_smooth(method='lm',fullrange=T,se=T) +  
    ggtitle(paste0("Slope: ",formatC(modelSummary$coefficients[2], format = "e", digits = 3),"; ",
                  "TMRCA: ",round(xIntercept,1),
                  "\n", "Correlation Coefficient: ",round(treeCorrelation$estimate,3),
                  "; ", "R^2: ", format(modelSummary$r.squared,digits=3),"\n",ntips," tips","; Timespan: ",mindate,"-",maxdate)) +
    theme(plot.title = element_text(size = 9))
  
  return(plot.tree.data.root2tip)
}

```


now plot
```{r}
p.subsampled.ML.tips.6pCountry.pSublin.root2tip <- plotRootToTip(subsampled.ML.tips.6pCountry.pSublin.tree, TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])

p.subsampled.ML.tips.6pCountry.pSublin.root2tip.2versions <- plot_grid(p.subsampled.ML.tips.6pCountry.pSublin.root2tip, p.subsampled.ML.tips.6pCountry.pSublin.root2tip + scale_x_continuous(limits = c(1400,2020)) + coord_cartesian(xlim=c(1400,2020), ylim=c(0,8.5e-5)), ncol=2, labels=c('B','C'), label_size=11) 

p.subsampled.ML.tips.6pCountry.pSublin.root2tip.2versions 
```

Plot ML Subtree with root-2-tip graph
```{r, fig.width=8, fig.height=8, message=FALSE, warning=FALSE}

#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure15__ML-subtree1-with-root2tip_02-2021.svg"), width = 800, height = 800,type="svg",units = "pt")
plot_grid(plot.subsampled.ML.tips.6pCountry.pSublin.tree +ylim(-1,139), p.subsampled.ML.tips.6pCountry.pSublin.root2tip.2versions, ncol=1, scale=0.9, labels=c('A',''), label_size=11)
#dev.off()
```



Took sequence alignment from this tree, and analysed using BEAST 1.8.4


BEAST analysis (HYK subst model), comparing 

Strict - Constant pop
Strict - Skyline pop (10 cats)
RelLogNormal - Constant pop
RelLogNormal - Exponential pop
RelLogNormal - Skyline pop (10 cats)

- Can't reject strict clock (ucldev.sd overlaps zero subtantially)
- Stepping stone analysis shows Strick-Skyline is best model (although Strict Constant is nearly as close)

Bring in Strict Skyline BEAST tree and plot


```{r, fig.width=10, fig.height=10, message=FALSE, warning=FALSE}
# Bring in beast tree and extract tree data into dataframe
TPA.beast.tree <- read.beast(TPA.beast.subtree.file)
TPA.beast.tree.data <- data.frame(fortify(TPA.beast.tree),stringsAsFactors = F)


# BEAST tipnames have date included - lets remove that for plotting with metadata
TPA.beast.tipnames <- data.frame(beast.name=TPA.beast.tree@phylo$tip.label,stringsAsFactors = F)
TPA.beast.tipnames$meta.name <- gsub("\\|.+$","", TPA.beast.tipnames$beast.name)
TPA.beast.tree@phylo$tip.label <- TPA.beast.tipnames$meta.name
#TPA.beast.tree@phylo$tip.label


# Build plot
TPA.beast.plot1 <- ggtree(TPA.beast.tree,mrsd="2019-06-01",ladderize = T) + 
  theme_tree2() +
  # Add date lines for easy interpretation  
  scale_x_continuous(breaks=c(1300,1400,1500,1600,1700,1800,1850,1900,1925,1950,1975,2000,2020), minor_breaks=seq(1950, 2020, 5)) +
  theme(panel.grid.major   = element_line(color="grey50", size=.2),
        panel.grid.minor   = element_line(color="grey85", size=.2),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) 
# Add posterior support as node points
TPA.beast.plot1 <- TPA.beast.plot1 + geom_point2(aes(subset=(!isTip & as.numeric(posterior)>0.8)),color="gray60",size=2.5,alpha=0.5, shape=18) + 
  geom_point2(aes(subset=(!isTip & as.numeric(posterior)>0.91)),color="gray40",size=2.5,shape=18,alpha=0.5) + 
  geom_point2(aes(subset=(!isTip & as.numeric(posterior)>=0.96)),color="black",size=2.5,shape=18,alpha=0.5)


# Plot 95% HPD intervals - geom_range seems unable to do correctly with this tree (known bug for tip dated trees), so extract data and plot using geom_segment
minmax <- t(matrix(unlist(TPA.beast.tree.data[!is.na(TPA.beast.tree.data$height_0.95_HPD),"height_0.95_HPD"]),nrow=2))
bar_df <- data.frame(node_id=TPA.beast.tree.data[!is.na(TPA.beast.tree.data$height_0.95_HPD),"node"],as.data.frame(minmax))
names(bar_df) <- c('node_id','min','max') 
bar_df <- bar_df %>% filter(node_id > Ntip(TPA.beast.tree@phylo))
bar_df <- bar_df %>% left_join(TPA.beast.plot1$data, by=c('node_id'='node')) %>% select(node_id,min,max,y)
mrcd.decimal <- decimal_date(as.Date("2019-06-01","%Y-%m-%d"))
TPA.beast.plot1 <- TPA.beast.plot1 + geom_segment(aes(x=mrcd.decimal-max, y=y, xend=mrcd.decimal-min, yend=y), data=bar_df, color='red', alpha=0.2, size=2.25)

TPA.beast.plot1 <- TPA.beast.plot1 + new_scale_fill()

TPA.beast.plot1 + geom_tiplab(size=2, align=T)
```


with metadata
```{r, fig.width=10, fig.height=10, message=FALSE, warning=FALSE}

TPA.beast.plot1.meta <- gheatmap(TPA.beast.plot1,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage), color=NULL,width=0.05, offset=2,colnames_angle=0,colnames_offset_y=-1, font.size=2.5) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size
TPA.beast.plot1.meta <- TPA.beast.plot1.meta + new_scale_fill()


TPA.beast.plot1.meta <- gheatmap(TPA.beast.plot1.meta,TPA.rawseq.countries.p, color=NULL,width=0.05,offset=36, colnames_angle=0,colnames_offset_y=-1, font.size=2.5) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size
TPA.beast.plot1.meta <- TPA.beast.plot1.meta + new_scale_fill()
beast.country.legend <- get_legend(TPA.beast.plot1.meta + theme(legend.key.size = unit(0.65,"line"),legend.position='right'))

TPA.beast.plot1.meta <- gheatmap(TPA.beast.plot1.meta,data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Year=TPA.meta1.2.pinecone$Sample_5year.window, stringsAsFactors = F), color=NULL,width=0.05, offset=70,colnames_angle=0,colnames_offset_y=-1, font.size=2.5) + scale_fill_manual(name="Year",values=TPA.5year.window.brewcols$window.5year.cols[1:(length(TPA.5year.window.brewcols$window.5year.cols)-1)], breaks=TPA.5year.window.brewcols$window.5year[1:(length(TPA.5year.window.brewcols$window.5year)-1)]) +
  theme.text.size + 
  #theme(legend.key.size = unit(0.65,"line"),legend.position='left') #+
  #guides(fill=guide_legend(ncol=3)) +
  NULL
TPA.beast.plot1.meta <- TPA.beast.plot1.meta + new_scale_fill()
beast.year.legend <- get_legend(TPA.beast.plot1.meta + theme(legend.key.size = unit(0.65,"line"),legend.position='right'))


TPA.beast.plot1.meta <- TPA.beast.plot1.meta + geom_vline(xintercept = 2000, color='blue', alpha=0.5)
TPA.beast.plot1.meta <- TPA.beast.plot1.meta + annotate("rect",xmin=2000,xmax=2020,ymin=-1,ymax=138,alpha=0.1, fill='blue')
```

Look at and plot skyline data
```{r}
beast.subtree.skyline <- read.table(beast.subtree.skyline.file, sep="\t", check.names=F, comment.char="", header=T, stringsAsFactors=F)

p.beast.subtree.skyline <- ggplot(beast.subtree.skyline, aes(Time,Median)) + 
  geom_line() +
  geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
  theme_light() + 
  scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) +
  scale_y_log10() + 
  coord_cartesian(x=c(1750,2020), y=c(50,3000)) + 
  theme.text.size +
  #labs(y="Median effective population size", x="Year") + 
  labs(y="Relative genetic diversity", x="Year") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) +
  annotate("rect",xmin=2000,xmax=2020,ymin=0,ymax=10000,alpha=0.1, fill='blue')
#p.beast.subtree.skyline
```

Look at lineage preditions
```{r}
beast.subtree.skyline.lineages <- read.table(beast.subtree.skyline.lineage.file, sep="\t", check.names=F, comment.char="", header=T, stringsAsFactors=F)

p.beast.subtree.skyline.lineages <- ggplot(beast.subtree.skyline.lineages, aes(Time,Median)) + 
  geom_line() +
  geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
  theme_light() + 
  scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) +
  scale_y_log10() + 
  coord_cartesian(x=c(1750,2020), y=c(1,300)) + 
  theme.text.size +
  labs(y="Lineages", x="Year") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) +
  annotate("rect",xmin=2000,xmax=2020,ymin=0,ymax=500,alpha=0.1, fill='blue')
#p.beast.subtree.skyline.lineages
```


Make combined plot
```{r, fig.width=10, fig.height=10, message=FALSE, warning=FALSE}
skyline.row <- plot_grid(NULL,p.beast.subtree.skyline,NULL, ncol=3, rel_widths = c(1,3,1))
lineage.row <- plot_grid(NULL,p.beast.subtree.skyline.lineages,NULL, ncol=3, rel_widths = c(1,3,1))

#both.skyline.lineage.rows <- plot_grid(skyline.row, lineage.row, align=T, ncol=1, labels=c('B','C'))
both.skyline.lineage.rows <- plot_grid(skyline.row, align=T, ncol=1, labels=c('B'),label_size=11)


#plot.beast.with.skyline <- plot_grid(TPA.beast.plot1.meta + theme(legend.position='none') + ylim(-1,138), both.skyline.lineage.rows, ncol=1, align=T, rel_heights = c(2,2),labels=c('A',''), label_size=11)

plot.beast.with.skyline <- plot_grid(TPA.beast.plot1.meta + theme(legend.position='none') + ylim(-1,138), both.skyline.lineage.rows, ncol=1, align=T, rel_heights = c(3,1),labels=c('A',''), label_size=11)

beast.legend.combined <- plot_grid(beast.year.legend, NULL, ncol=1, rel_heights=c(3,1))



#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure16_subsample138_BEAST-StrictCSkyline_02-2021.svg"), width = 800, height = 800,type="svg",units = "pt")
plot_grid(beast.legend.combined, plot.beast.with.skyline, ncol=2, rel_widths=c(1,10), labels=c('Key',''), label_size=11)
#dev.off()
```




Pull out MRCA nodes and date ranges from beast subtree and sublineages
```{r, fig.width=10, fig.height=10, message=FALSE, warning=FALSE}
# Subset metadata
TPA.meta1.2.beast.subset1 <- TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Sample_Name %in% as.phylo(TPA.beast.tree)$tip.label,]
# Define key sublineages
#Expanded.sublineages <- data.frame(Sublineage=c(2,3,5,6,12,19), stringsAsFactors = F)
Expanded.sublineages <- data.frame(Sublineage=c(1,2,8,14), stringsAsFactors = F)


# Run loop to extract MRCA node for each sublineage
Expanded.sublineage.nodes <- NULL
for (current.sublineage.exp1 in Expanded.sublineages$Sublineage) {
  Expanded.sublineage.nodes <- c(Expanded.sublineage.nodes, ape::getMRCA(as.phylo(TPA.beast.tree),as.character(TPA.meta1.2.beast.subset1[TPA.meta1.2.beast.subset1$TPA.pinecone.sublineage==current.sublineage.exp1,"Sample_Name"])))
}
Expanded.sublineages$node <- Expanded.sublineage.nodes
#Expanded.sublineages

# Split sublineage 2 in this tree creates a problem. Change it to 180
#Expanded.sublineages[Expanded.sublineages$sublineage==2,]
Expanded.sublineages[Expanded.sublineages$Sublineage==2,"node"] <- 180


TPA.beast.plot1 + geom_text2(aes(subset=!isTip, label=node), hjust=-.3, size=2) +
  geom_point2(aes(subset=(node %in% Expanded.sublineages$node)),color="red") +
  ggtitle("Red nodes indicate MRCA for each sublineage")

```

```{r}
#Extract relevant nodes data from beast tree data
Expanded.sublineage.nodes.beast <- plyr::join(Expanded.sublineages,TPA.beast.tree.data[,c("node","height","height_0.95_HPD","height_median","height_range")], by="node")

# Data is in the form of "height" information - need to convert to years relative to mrcd (2019/06/01)
Expanded.sublineage.nodes.beast$mrca.median <- 2019.5 - Expanded.sublineage.nodes.beast$height_median
Expanded.sublineage.nodes.beast$year <- as.numeric(round(2019.5 - Expanded.sublineage.nodes.beast$height_median,0))


Expanded.sublineage.nodes.beast$mrca.95high <- round(2019.5 - sapply(1:nrow(Expanded.sublineage.nodes.beast),function(x) as.numeric(unlist(Expanded.sublineage.nodes.beast[x,"height_0.95_HPD"]))[1]))

Expanded.sublineage.nodes.beast$mrca.95low <- round(2019.5 - sapply(1:nrow(Expanded.sublineage.nodes.beast),function(x) as.numeric(unlist(Expanded.sublineage.nodes.beast[x,"height_0.95_HPD"]))[2]))


Expanded.sublineage.nodes.beast <- Expanded.sublineage.nodes.beast[order(Expanded.sublineage.nodes.beast$Sublineage),]
Expanded.sublineage.nodes.beast$Sublineage <- factor(Expanded.sublineage.nodes.beast$Sublineage, levels=rev(sublineages.cols.brew$sublineage))
Expanded.sublineage.nodes.beast$sub.order <- rev(c(1:nrow(Expanded.sublineage.nodes.beast)))


```



Repeat BEAST subsampling, but ensure all year are represented (and make a slightly bigger tree)

```{r}

year.list <- sort(unique(TPA.meta1.2.pinecone.havedates$Sample_Year))


mysamplesize.2 <- 3
total.bootstraps.2 <- 1

all.bootstraps.lists.2 <- NULL
all.bootstraps.trees.2 <- c(rtree(20)) # create with random start tree to force into a multiphylo object
current.bootstrap.2 <- 0
repeat {
  country.sample.2 <- NULL
  for (current.year.2 in year.list) { 
    for (current.lineage.2 in sublineage.lineage.list2){
      for (current.country.2 in Country.list2){
        current.list.2 <- TPA.meta1.2.pinecone.havedates[(TPA.meta1.2.pinecone.havedates$TPA.pinecone.sublineage==current.lineage.2 & TPA.meta1.2.pinecone.havedates$Geo_Country==current.country.2 & TPA.meta1.2.pinecone.havedates$Sample_Year==current.year.2),"Sample_Name"]
        current.list.2 <- current.list.2[!is.na(current.list.2)]
        if (length(current.list.2)>1){
          current.sample.2 <- unique(sample(current.list.2,size=mysamplesize.2, replace=T))
          country.sample.2 <- c(as.vector(current.sample.2), country.sample.2)
        }
      }
    }
  }
  country.sample.2 <- c(country.sample.2, as.vector(TPA.meta1.2.pinecone.havedates[(TPA.meta1.2.pinecone.havedates$TPA.pinecone.sublineage=="Singleton"),"Sample_Name"]))
  
  current.bootstrap.2 <- current.bootstrap.2 + 1
  current.sample.tree.2 <- (ape::keep.tip(TPA.MLtree, country.sample.2))
  all.bootstraps.trees.2 <- c(all.bootstraps.trees.2,c(current.sample.tree.2),recursive=T)
  all.bootstraps.lists.2 <- c(all.bootstraps.lists.2, list(country.sample.2))
  if (current.bootstrap.2 == total.bootstraps.2){
    break
  }
}
all.bootstraps.trees.2 <- all.bootstraps.trees.2[c(2:(total.bootstraps.2+1))] # remove random start tree

#all.bootstraps.trees[[1]]
#subsampled.ML.tips.3pv.country.dates.sublin <- all.bootstraps.trees.2[[1]]$tip.label

# Outputs from a previous random loop used below to ensure sample remains the same
subsampled.ML.tips.3pv.country.dates.sublin <- c("Mexico_A-mcf", "TPA_RUS_Tuva-62", "TPA_RUS_Tuva-39", "TPA_RUS_Tuva-61", "PHE140073A", "UW337B", "UW186B", "UW330B", "UW213B", "UW215B", "PHE150129A", "TPA_UKBRG017", "TPA_BCC161", "TPA_OMI006", "UW231B", "UW187B", "K3", "SHE_V", "SHG_I2", "C3", "TPA_BCC165", "TPA_USL-BAL-2", "SS14_v2", "TPA_HUN200024", "TPA_HUN190022", "TPA_USL-SEA-81-8", "TPA_ZIM025", "TPA_ZIM019", "TPA_ZIM007", "UW262B", "UW391B", "PHE150151A", "TPA_UKBIR026", "TPA_OMI002", "TPA_BCC139", "TPA_BCC137", "TPA_UKBRG015", "PHE130041A", "PHE130050A", "UW376B", "PHE150159A", "UW280B", "UW244B", "TPA_BCC125", "PHE160254A", "PT_SIF1183", "PHE160249A", "PHE170379A", "PHE160315A", "PHE160260A", "PHE150131A", "PHE170402A", "PHE170380A", "UW473B", "UW492B", "TPA_USL-Phil-1", "PHE120030A", "PHE120014A", "PHE130043A", "PT_SIF1167", "PHE140093A", "TPA_HUN190023", "TPA_BCC058", "TPA_USL-SEA-87-1", "TPA_UKBIR050", "UW155B", "UW211B", "TPA_BCC038", "TPA_BCC034", "TPA_BCC008", "TPA_BCC009", "UW257B", "UW138B", "UW102B", "UW344B", "UW126B", "SMUTp_01", "SMUTp_08", "UW383B", "TPA_ALC115", "UW823B", "PHE170385A", "UW304B", "PT_SIF0954", "PT_SIF1200", "TPA_BCC128", "PHE130056A", "TPA_ALC036", "TPA_BCC032", "TPA_BCC132", "TPA_AUSBR-45", "TPA_BCC153", "TPA_AUSBR-39", "TPA_HUN180007", "UW411B", "TPA_ESBCN002", "PHE140074A", "TPA_SWE-467", "PHE150126A", "PHE170412A", "PHE150173A", "TPA_UKBRG009", "TPA_ESBCN004", "TPA_BCC123", "TPA_BCC147", "TPA_BCC063", "PHE120024A", "TPA_BCC174", "TPA_SWE-575", "TPA_HUN190017", "PHE160248A", "TPA_BCC176", "TPA_HUN180001", "PHE130054A", "UW852B", "PHE140081A", "UW259B", "TPA_AUSBR-113", "PHE160301A", "TPA_BCC102", "PT_SIF1278", "PT_SIF0877_3", "PHE160265A", "AU15", "TPA_EIR008", "TPA_ALC055", "TPA_BCC175", "PT_SIF1280", "TPA_BCC185", "TPA_BCC111", "TPA_BCC157", "TPA_BCC030", "PHE170409A", "TPA_BCC049", "TPA_BCC061", "TPA_BCC052", "PHE150161A", "TPA_ALC126", "TPA_OMI022", "UW189B", "UW279B", "TPA_OMI021", "PHE170401A", "CW59", "CW82", "BAL73", "TPA_ZIM015", "TPA_ZIM020", "PHE150118A", "PHE170333A", "PHE160302A", "PHE140089A", "PHE170381A", "PHE160263A", "TPA_BCC089", "PHE160316A", "TPA_BCC122", "PHE170386A", "PHE140076A", "PHE150149A", "TPA_BCC136", "PHE150170A", "TPA_UKBRG007", "TPA_BCC081", "TPA_BCC012", "PHE120029A", "PHE120033A", "TPA_USL-Haiti-B")



subsampled.ML.tips.3pv.country.dates.sublin.tree <- (ape::keep.tip(TPA.MLtree, subsampled.ML.tips.3pv.country.dates.sublin))

subsampled.ML.tips.3pv.country.dates.sublin.tree.data <- data.frame(fortify(subsampled.ML.tips.3pv.country.dates.sublin.tree),stringsAsFactors = F)
subsampled.ML.tips.3pv.country.dates.sublin.tree.data$Sample_Name <- subsampled.ML.tips.3pv.country.dates.sublin.tree.data$label 
subsampled.ML.tips.3pv.country.dates.sublin.tree.data <- plyr::join(subsampled.ML.tips.3pv.country.dates.sublin.tree.data, TPA.meta1.2.pinecone.havedates[,c("Sample_Name","TPA.pinecone.major","TPA.pinecone.sublineage", "Sample_Year")], by="Sample_Name", type="left")

subsampled.metalist2 <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Sample_Name %in% subsampled.ML.tips.3pv.country.dates.sublin),c("Sample_Name","Cleaned_fastq_id","Sample_Year")] 


plot.subsampled.ML.tips.3pv.country.dates.sublin.tree <- ggtree(subsampled.ML.tips.3pv.country.dates.sublin.tree) %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage) + 
  geom_tippoint(aes(color=Sublineage), size=1, alpha=0.5) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) 

plot.subsampled.ML.tips.3pv.country.dates.sublin.tree <- gheatmap(plot.subsampled.ML.tips.3pv.country.dates.sublin.tree,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage), color=NULL,width=0.04, colnames_angle=0,colnames_offset_y=-1, font.size=2.5) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size + theme(legend.key.size = unit(0.5,"line"))

plot.subsampled.ML.tips.3pv.country.dates.sublin.tree
```
do root2tip for second subsampled tree
```{r}
p.subsampled.ML.tips.3pv.country.dates.sublin.root2tip <- plotRootToTip(subsampled.ML.tips.3pv.country.dates.sublin.tree, TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])


plot_grid(p.subsampled.ML.tips.3pv.country.dates.sublin.root2tip, p.subsampled.ML.tips.3pv.country.dates.sublin.root2tip + scale_x_continuous(limits = c(1400,2020)) + coord_cartesian(xlim=c(1400,2020), ylim=c(0,8.5e-5)), ncol=2)
```

Plot repeat subsetted tree and skyline info

```{r, fig.width=10, fig.height=10, message=FALSE, warning=FALSE}
repeat.subsampled.skyline.tree <- read.beast(repeat.subsampled.skyline.tree.file)
repeat.subsampled.skyline.tree.data <- data.frame(fortify(repeat.subsampled.skyline.tree),stringsAsFactors = F)

# BEAST tipnames have date included - lets remove that for plotting with metadata
repeat.subsampled.skyline.tree.tipnames <- data.frame(beast.name=repeat.subsampled.skyline.tree@phylo$tip.label,stringsAsFactors = F)
repeat.subsampled.skyline.tree.tipnames$meta.name <- gsub("\\|.+$","", repeat.subsampled.skyline.tree.tipnames$beast.name)
repeat.subsampled.skyline.tree@phylo$tip.label <- repeat.subsampled.skyline.tree.tipnames$meta.name

# Build plot
repeat.subsampled.skyline.plot1 <- ggtree(repeat.subsampled.skyline.tree,mrsd="2019-06-01",ladderize = T) + 
  theme_tree2() +
  # Add date lines for easy interpretation  
  scale_x_continuous(breaks=c(1300,1400,1500,1600,1700,1800,1850,1900,1925,1950,1975,2000,2020), minor_breaks=seq(1950, 2020, 5)) +
  theme(panel.grid.major   = element_line(color="grey50", size=.2),
        panel.grid.minor   = element_line(color="grey85", size=.2),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) 
# Add posterior support as node points
repeat.subsampled.skyline.plot1 <- repeat.subsampled.skyline.plot1 + geom_point2(aes(subset=(!isTip & as.numeric(posterior)>0.8)),color="gray60",size=2.5,alpha=0.5, shape=18) + 
  geom_point2(aes(subset=(!isTip & as.numeric(posterior)>0.91)),color="gray40",size=2.5,shape=18,alpha=0.5) + 
  geom_point2(aes(subset=(!isTip & as.numeric(posterior)>=0.96)),color="black",size=2.5,shape=18,alpha=0.5)

# Plot 95% HPD intervals - geom_range seems unable to do correctly with this tree (known bug for tip dated trees), so extract data and plot using geom_segment
rep.minmax <- t(matrix(unlist(repeat.subsampled.skyline.tree.data[!is.na(repeat.subsampled.skyline.tree.data$height_0.95_HPD),"height_0.95_HPD"]),nrow=2))
rep.bar_df <- data.frame(node_id=repeat.subsampled.skyline.tree.data[!is.na(repeat.subsampled.skyline.tree.data$height_0.95_HPD),"node"],as.data.frame(rep.minmax))
names(rep.bar_df) <- c('node_id','min','max') 
rep.bar_df <- rep.bar_df %>% filter(node_id > Ntip(repeat.subsampled.skyline.tree@phylo))
rep.bar_df <- rep.bar_df %>% left_join(repeat.subsampled.skyline.plot1$data, by=c('node_id'='node')) %>% select(node_id,min,max,y)
rep.mrcd.decimal <- decimal_date(as.Date("2019-06-01","%Y-%m-%d"))
repeat.subsampled.skyline.plot1 <- repeat.subsampled.skyline.plot1 + geom_segment(aes(x=rep.mrcd.decimal-max, y=y, xend=rep.mrcd.decimal-min, yend=y), data=rep.bar_df, color='red', alpha=0.2, size=2.25)
repeat.subsampled.skyline.plot1 <- repeat.subsampled.skyline.plot1 + new_scale_fill()

# Add markers after 2000
repeat.subsampled.skyline.plot1 <- repeat.subsampled.skyline.plot1 + geom_vline(xintercept = 2000, color='blue', alpha=0.5) + 
  annotate("rect",xmin=2000,xmax=2020,ymin=-1,ymax=170,alpha=0.1, fill='blue')

repeat.subsampled.skyline.plot1 + geom_tiplab(size=1.5, align=T)



```

```{r, fig.width=10, fig.height=10, message=FALSE, warning=FALSE}
# Skyline
rep.beast.subtree.skyline <- read.table(repeat.subsampled.skyline.data.file, sep="\t", check.names=F, comment.char="", header=T, stringsAsFactors=F)
p.rep.beast.subtree.skyline <- ggplot(rep.beast.subtree.skyline, aes(Time,Median)) + 
  geom_line() + geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
  theme_light() + 
  scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + scale_y_log10() + 
  coord_cartesian(x=c(1750,2020), y=c(50,3000)) + 
  theme.text.size + labs(y="Relative genetic diversity", x="Year") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) + annotate("rect",xmin=2000,xmax=2020,ymin=0,ymax=10000,alpha=0.1, fill='blue')

# Lineages
rep.beast.subtree.skyline.lineages <- read.table(repeat.subsampled.skyline.lineages.data.file, sep="\t", check.names=F, comment.char="", header=T, stringsAsFactors=F)
p.rep.beast.subtree.skyline.lineages <- ggplot(rep.beast.subtree.skyline.lineages, aes(Time,Median)) + 
  geom_line() + geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
  theme_light() + 
  scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + scale_y_log10() + 
  coord_cartesian(x=c(1750,2020), y=c(1,300)) + 
  theme.text.size + labs(y="Lineages", x="Year") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) + annotate("rect",xmin=2000,xmax=2020,ymin=0,ymax=500,alpha=0.1, fill='blue')
#p.rep.beast.subtree.skyline.lineages



# Make combined plot
rep.skyline.row <- plot_grid(NULL,p.rep.beast.subtree.skyline,NULL, ncol=3, rel_widths = c(1,3,1))
rep.lineage.row <- plot_grid(NULL,p.rep.beast.subtree.skyline.lineages,NULL, ncol=3, rel_widths = c(1,3,1))
#rep.both.skyline.lineage.rows <- plot_grid(rep.skyline.row, rep.lineage.row, align=T, ncol=1, labels=c('B','C'))
rep.both.skyline.lineage.rows <- plot_grid(rep.skyline.row, align=T, ncol=1, labels=c('B'),label_size=11)

#plot.beast.with.skyline.repeat <- plot_grid(repeat.subsampled.skyline.plot1 + theme(legend.position='none'), rep.both.skyline.lineage.rows, ncol=1, align=T, rel_heights = c(2,2),labels=c('A',''), label_size=11)

plot.beast.with.skyline.repeat <- plot_grid(repeat.subsampled.skyline.plot1 + theme(legend.position='none'), rep.both.skyline.lineage.rows, ncol=1, align=T, rel_heights = c(3,1),labels=c('A',''), label_size=11)



#plot.beast.with.skyline <- plot_grid(TPA.beast.plot1.meta + theme(legend.position='none'), skyline.row, ncol=1, align=T, rel_heights = c(2,1),labels=c('A','B'), label_size=11)
#beast.legend.combined <- plot_grid(beast.year.legend, NULL, ncol=1, rel_heights=c(3,1))


#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure17_subsample2-168_BEAST_StrictCSkyline_02-2021.svg"), width = 800, height = 800,type="svg",units = "pt")
plot.beast.with.skyline.repeat 
#dev.off()

```

Plotting Root-to-tip
```{r}
# Define function
plot.subtree.R2T <- function(maintree, tips, dates.df){
  subtree.temp  <- ape::keep.tip(maintree, as.character(tips))
  plotRootToTip(subtree.temp, dates.df[,c("Sample_Name","Sample_Year")])
  plot_grid(ggtree(subtree.temp),plotRootToTip(subtree.temp, dates.df[,c("Sample_Name","Sample_Year")]),ncol=2)
}

```


Full tree dataset for BEAST1/2:
```{r}
full.beast2.tree <- read.beast(full.beast2.tree.file)
tree.subsampled.ML.tips.full.data.for.beast2 <- (ape::keep.tip(TPA.MLtree, gsub("\\|.+$","",full.beast2.tree@phylo$tip.label, perl =T)))

tree.subsampled.ML.tips.full.data.for.beast2.r2t <- plot.subtree.R2T(tree.subsampled.ML.tips.full.data.for.beast2, gsub("\\|.+$","",full.beast2.tree@phylo$tip.label), TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])


#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure18__ML-tree.fulltree520-with-root2tip_02-2021.svg"), width = 800, height = 600,type="svg",units = "pt")
tree.subsampled.ML.tips.full.data.for.beast2.r2t
#dev.off()

```



\
Pull in data for full BEAST2 tree \
\

Full Beast tree
```{r, fig.width=12, fig.height=12, message=FALSE, warning=FALSE}
full.beast2.tree <- read.beast(full.beast2.tree.file)
full.beast2.tree.data <- data.frame(fortify(full.beast2.tree),stringsAsFactors = F)


# Lets sort out the date of various events - relate everything in years
full.beast2.tree.data$mrca.median <- 2019.5 - full.beast2.tree.data$height_median
full.beast2.tree.data$year <- as.numeric(round(2019.5 - full.beast2.tree.data$height_median,0))

full.beast2.tree.data$mrca.95high <- round(2019.5 - sapply(1:nrow(full.beast2.tree.data),function(x) as.numeric(unlist(full.beast2.tree.data[x,"height_0.95_HPD"]))[1]))

full.beast2.tree.data$mrca.95low <- round(2019.5 - sapply(1:nrow(full.beast2.tree.data),function(x) as.numeric(unlist(full.beast2.tree.data[x,"height_0.95_HPD"]))[2]))


# Join metadata
full.beast2.tree.data$Sample_Name <- gsub("\\|.+$","", full.beast2.tree.data$label)
full.beast2.tree.data.meta <- plyr::join(full.beast2.tree.data, TPA.meta1.2.pinecone[,c("Sample_Name", "Geo_Country","TPA.pinecone.sublineage","Sample_Year")], by="Sample_Name", type="left")

full.beast2.tree.data.meta <- full.beast2.tree.data.meta[full.beast2.tree.data.meta$isTip==T,]
```



Extract some key dates

```{r}
#####
#############full.beast2.tree.data.tip <- full.beast2.tree.data[full.beast2.tree.data$isTip]
######
Expanded.sublineages <- data.frame(Sublineage=c(1,2,8,14), stringsAsFactors = F)

# Run loop to extract MRCA node for each sublineage
Expanded.sublineage.nodes <- NULL
for (current.sublineage.exp1 in Expanded.sublineages$Sublineage) {
  Expanded.sublineage.nodes <- c(Expanded.sublineage.nodes, ape::getMRCA(as.phylo(full.beast2.tree),as.character(full.beast2.tree.data.meta[full.beast2.tree.data.meta$TPA.pinecone.sublineage==current.sublineage.exp1,"Sample_Name"])))
}
Expanded.sublineages$node <- Expanded.sublineage.nodes


# Plot tree with node labels (to visualise and identify key nodes)
ggtree(full.beast2.tree) + geom_text2(aes(subset=!isTip, label=node), hjust=-.3, size=2.5, color="blue") 

# Split sublineage 1 in this tree creates a problem. Change it to 528 (MRCA of main clade)
Expanded.sublineages[Expanded.sublineages$Sublineage==1,"node"] <- 528



# Extract dates for the MRCAs - when did those sublineages first arise? Pretty old!
plyr::join(Expanded.sublineages, full.beast2.tree.data[,c("node","mrca.median","year","mrca.95high","mrca.95low")], by="node")


ggtree(full.beast2.tree,mrsd="2019-06-01",ladderize = T) + geom_text2(aes(subset=!isTip, label=node), hjust=-.3, size=2) +
  geom_point2(aes(subset=(node %in% Expanded.sublineages$node)),color="blue") +
  ggtitle("Red nodes indicate MRCA for each sublineage")

```

Lets plot the BEAST tree
```{r, fig.width=12, fig.height=12, message=FALSE, warning=FALSE}
# BEAST tipnames have date included - lets remove that for plotting with metadata
full.beast2.tipnames <- data.frame(beast.name=full.beast2.tree@phylo$tip.label,stringsAsFactors = F)
full.beast2.tipnames$meta.name <- gsub("\\|.+$","", full.beast2.tipnames$beast.name)
full.beast2.tree@phylo$tip.label <- full.beast2.tipnames$meta.name
#TPA.beast.tree@phylo$tip.label


# Build plot
full.beast2.tree.plot1 <- ggtree(full.beast2.tree,mrsd="2019-06-01",ladderize = T) + 
  theme_tree2() +
  # Add date lines for easy interpretation  
  scale_x_continuous(breaks=c(1300,1400,1500,1600,1700,1800,1850,1900,1925,1950,1975,2000,2020), minor_breaks=seq(1950, 2020, 5)) +
  theme(panel.grid.major   = element_line(color="grey50", size=.2),
        panel.grid.minor   = element_line(color="grey85", size=.2),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) 
# Add posterior support as node points
full.beast2.tree.plot1 <- full.beast2.tree.plot1 + geom_point2(aes(subset=(!isTip & as.numeric(posterior)>0.8)),color="gray60",size=2.5,alpha=0.5, shape=18) + 
  geom_point2(aes(subset=(!isTip & as.numeric(posterior)>0.91)),color="gray40",size=2.75,shape=18,alpha=0.5) + 
  geom_point2(aes(subset=(!isTip & as.numeric(posterior)>=0.96)),color="black",size=2.75,shape=18,alpha=0.5) +
    geom_rootedge(rootedge=50)


# Plot 95% HPD intervals - geom_range seems unable to do correctly with this tree (known bug for tip dated trees), so extract data and plot using geom_segment
minmax.fulltree <- t(matrix(unlist(full.beast2.tree.data[!is.na(full.beast2.tree.data$height_0.95_HPD),"height_0.95_HPD"]),nrow=2))
bar_df.fulltree <- data.frame(node_id=full.beast2.tree.data[!is.na(full.beast2.tree.data$height_0.95_HPD),"node"],as.data.frame(minmax.fulltree))
names(bar_df.fulltree) <- c('node_id','min','max') 
bar_df.fulltree <- bar_df.fulltree %>% filter(node_id > Ntip(full.beast2.tree@phylo))
bar_df.fulltree <- bar_df.fulltree %>% left_join(full.beast2.tree.plot1$data, by=c('node_id'='node')) %>% select(node_id,min,max,y)
mrcd.decimal.fulltree <- decimal_date(as.Date("2019-06-01","%Y-%m-%d"))
full.beast2.tree.plot1 <- full.beast2.tree.plot1 + geom_segment(aes(x=mrcd.decimal.fulltree-max, y=y, xend=mrcd.decimal.fulltree-min, yend=y), data=bar_df.fulltree, color='red', alpha=0.2, size=1.5)

full.beast2.tree.plot1 <- full.beast2.tree.plot1 + new_scale_fill()
#full.beast2.tree.plot1 #+ geom_tiplab(size=2, align=T)

# Add Metadata
full.beast2.tree.plot1.meta <- gheatmap(full.beast2.tree.plot1,
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage), color=NULL,width=0.05, offset=2,colnames_angle=0,colnames_offset_y=-1.5, font.size=2.25,) + 
  scale_fill_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  theme.text.size
full.beast2.tree.plot1.meta <- full.beast2.tree.plot1.meta + new_scale_fill()

full.beast2.tree.plot1.meta <- gheatmap(full.beast2.tree.plot1.meta,TPA.rawseq.countries.p, color=NULL,width=0.05,offset=36, colnames_angle=0,colnames_offset_y=-1.5, font.size=2.25) + 
  scale_fill_manual(name="Country",values=continental.country.cols.brew2$country.col, breaks=continental.country.cols.brew2$Geo_Country) +
  theme.text.size
full.beast2.tree.plot1.meta <- full.beast2.tree.plot1.meta + new_scale_fill()
#beast.country.legend <- get_legend(TPA.beast.plot1.meta + theme(legend.key.size = unit(0.65,"line"),legend.position='right'))

full.beast2.tree.plot1.meta <- gheatmap(full.beast2.tree.plot1.meta,data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Year=TPA.meta1.2.pinecone$Sample_5year.window, stringsAsFactors = F), color=NULL,width=0.05, offset=70,colnames_angle=0,colnames_offset_y=-1.5, font.size=2.25) + scale_fill_manual(name="Year",values=TPA.5year.window.brewcols$window.5year.cols[1:(length(TPA.5year.window.brewcols$window.5year.cols)-1)], breaks=TPA.5year.window.brewcols$window.5year[1:(length(TPA.5year.window.brewcols$window.5year)-1)]) +
  theme.text.size + theme(legend.key.size = unit(0.55,"line"), legend.position="left") +
  coord_cartesian(y=c(-5,522)) +
  NULL
full.beast2.tree.plot1.meta <- full.beast2.tree.plot1.meta + new_scale_fill()
#full.beast2.tree.plot1.meta.legend <- get_legend(full.beast2.tree.plot1.meta + theme(legend.key.size = unit(0.65,"line"),legend.position='right'))


full.beast2.tree.plot1.meta <- full.beast2.tree.plot1.meta + geom_vline(xintercept = 2000, color='blue', alpha=0.5)
full.beast2.tree.plot1.meta <- full.beast2.tree.plot1.meta + annotate("rect",xmin=2000,xmax=2020,ymin=-4,ymax=521,alpha=0.1, fill='blue')


full.beast2.tree.plot1.meta #+ geom_text2(aes(subset=!isTip, label=node), hjust=-.3, size=2)
```


Pull in skyline analysis for full beast2 tree
```{r}
sublineage.skylines.filepath <- beast2.runs.filepath

beast2.full.skyline <- read.table(paste0(beast2.runs.filepath,beast2.full.skyline.path), sep="\t", header=T)
beast2.full.lineages <- read.table(paste0(beast2.runs.filepath,beast2.full.lineages.path), sep="\t", header=T)

# Chris Ruis' script to extract distribution of trees supporting expansion above baseline
# In this analysis, looked for 2-fold expansion (-p 100), in which 0.6745 (67.5%) supported expansion
beast2.full.popdistro <- read.table(paste0(beast2.runs.filepath,beast2.full.popdistro.path), sep="\t", header=T)


# plot skyline
beast2.full.skyline.plot <- ggplot(beast2.full.skyline, aes(Time,Median)) + 
  geom_line() + geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
  theme_light() + 
  scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + 
  scale_y_log10() + 
  coord_cartesian(x=c(1740,2020),y=c(30,3000)) + 
  theme.text.size + labs(y="Relative genetic\ndiversity", x="Year") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.y=element_text(color = "grey25",angle=0, size=10))


# Plot 'lineages through time'
beast2.full.lineages.plot <- ggplot(beast2.full.lineages, aes(Time,Median)) + 
  geom_line() + geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
  theme_light() + 
  scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + 
  scale_y_log10() + 
  coord_cartesian(x=c(1740,2020),y=c(1,1000)) + 
  theme.text.size + labs(y="Inferred Lineages", x="Year") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.y=element_text(color = "grey25",angle=0, size=10))


# Plot population expansion date distro
beast2.full.popdistros.starts <- ggplot(beast2.full.popdistro[beast2.full.popdistro$Increase_date>0,], aes("Full Tree",Increase_date)) +
  geom_sina(alpha=0.75,color="grey80", size=1) + 
  geom_boxplot(alpha=0.0,outlier.shape = NA, width=0.25) +
  coord_cartesian(ylim=c(1740,2020)) +
  scale_y_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01), limits = c(1740,2020)) + 
  theme_light() + 
  coord_flip() + theme.text.size +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.y=element_text(color = "grey25",angle=0, size=10)) +
  theme(axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
  labs(y="Year", x="Distribution density\n(start of 2-fold\npopulation expansion)") + 
  geom_hline(yintercept = 2000, color='blue', alpha=0.5) +
  NULL

# alternate plot type
beast2.full.popdistros.starts.densiplot <- ggplot(beast2.full.popdistro[beast2.full.popdistro$Increase_date>0,], aes(x=Increase_date)) + 
  geom_density() +
  coord_cartesian(xlim=c(1740,2020)) +
  #scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01), limits=c(1740,2020)) + 
  scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + 
  theme_light() + 
  theme.text.size + 
  labs(x="Year", y="Distribution density\n(start of 2-fold\npopulation expansion)") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) +
  NULL
  



#plot_grid(beast2.full.skyline.plot,beast2.full.lineages.plot, ncol=1, align=T)
#plot_grid(beast2.full.skyline.plot + x.theme.strip, beast2.full.lineages.plot + x.theme.strip, beast2.full.popdistros.starts, ncol=1, align=T)

#plots.beast2.full.skyline.distro.combi <- plot_grid(beast2.full.skyline.plot + x.theme.strip, beast2.full.lineages.plot + x.theme.strip, beast2.full.popdistros.starts.densiplot, ncol=1, align='v',axis='t', labels = c('B','C','D'), label_size = 11,  label_x=0, label_y=1, scale=0.95)

plots.beast2.full.skyline.distro.combi <- plot_grid(beast2.full.skyline.plot + x.theme.strip, beast2.full.popdistros.starts.densiplot, ncol=1, align='v',axis='t', labels = c('B','C'), label_size = 11,  label_x=0, label_y=1, scale=0.95)


plots.beast2.full.skyline.distro.combi
```



Make combined plot (beast2 full + skylines)
```{r, fig.width=12, fig.height=12, message=FALSE, warning=FALSE}
#arrange.plots.beast2.full.skyline.distro.combi <- plot_grid('',plots.beast2.full.skyline.distro.combi, '', ncol=1, rel_heights=c(1,4,2))
arrange.plots.beast2.full.skyline.distro.combi <- plot_grid('',plots.beast2.full.skyline.distro.combi, '', ncol=1, rel_heights=c(1,4,4))


# Plot full BEAST tree with metadata strips
plot.full.beast2.with.skyline.distros <- plot_grid(full.beast2.tree.plot1.meta,arrange.plots.beast2.full.skyline.distro.combi,ncol=2, rel_widths=c(4,2), labels=c('A',''), label_size=11)

# Plot full BEAST tree without metadata
#plot.full.beast2.with.skyline.distros <- plot_grid(full.beast2.tree.plot1 + geom_vline(xintercept = 2000, color='blue', alpha=0.5) + annotate("rect",xmin=2000,xmax=2020,ymin=-4,ymax=521,alpha=0.1, fill='blue'), arrange.plots.beast2.full.skyline.distro.combi,ncol=2, rel_widths=c(4,2), labels=c('A',''), label_size=11)


#Cairo::Cairo(file=paste0(Figure_output_directory, "Figure3_Full-beast2+_skyline-+pop-expansion__02-2021.svg"), width = 1500, height = 1000,type="svg",units = "pt")
plot.full.beast2.with.skyline.distros
#dev.off()


```





# Look at explicit support for population decline and expansion within the Beast2 tree distribution \

Extract trees supporting a population 2-fold decline within a defined window (1990-2005) using Chris Ruis' script:\
`python3 /nfs/users/nfs_m/mb29/scripts/population_change_support_BEAST.py -t TPA-uber_beast2_strict-skyline-500M_10pop_combined.trees -l TPA-uber_beast2_strict-skyline-500M_10pop_combined.log -p 50 -w 1990 2005 --decrease -d 2019.5 -b 2 -o beast2_strict-skyline-500M_10pop_pop-decline.1990-2005.p50` \
\
Analysis shows 82.8% of trees (11191/ ) support a population decline between 1990-2005. \
\

Extract trees supporting a 2-fold population increase within a defined window (2000-2015) using Chris Ruis' script:\
`python3 /nfs/users/nfs_m/mb29/scripts/population_change_support_BEAST.py -t TPA-uber_beast2_strict-skyline-500M_10pop_combined.trees -l TPA-uber_beast2_strict-skyline-500M_10pop_combined.log -p 100 -w 2000 2015 -d 2019.5 -b 2` \
\
Analysis shows 83.3% of trees (11245/ ) support a population increase between 2000-2015. \


To be consistent with the 'decline' plot, extracted trees supporting a 2-fold population increase within a defined window (1990-2015) using Chris Ruis' script:\
`python3 /nfs/users/nfs_m/mb29/scripts/population_change_support_BEAST.py -t TPA-uber_beast2_strict-skyline-500M_10pop_combined.trees -l TPA-uber_beast2_strict-skyline-500M_10pop_combined.log -p 100 -w 1990 2015 -d 2019.5 -b 2` \
\
Analysis shows 58.5% of trees (11245/ ) support a population increase between 1990-2015. \
\
\
Plot distributions

```{r}
beast2.pop.decline <- read.csv(beast2.pop.decline.file, header=T, stringsAsFactors=F)
beast2.pop.increase <- read.csv(beast2.pop.increase.file, header=T, stringsAsFactors=F)

beast2.pop.decline$query <- "Decline"
beast2.pop.increase$query <- "Expansion"
beast2.pop.decline.increase <- rbind(beast2.pop.decline,beast2.pop.increase)

# Create dataframe with supporting values for text plotting
#decline.support.value <- 82.8
#increase.support.value <- 83.3
decline.support.value <- "90.7"
increase.support.value <- "59.0"



beast.decline.increase.support.values <- data.frame(query=c("Decline","Expansion"),proportion=c(decline.support.value,increase.support.value), median.date=c(median(beast2.pop.decline$Date_of_change), median(beast2.pop.increase$Date_of_change)), stringsAsFactors=F)


p.beast2.full.popdecline.increase.densiplot <- ggplot(beast2.pop.decline.increase, aes(x=Date_of_change, group=query, color=query, fill=query)) + 
  geom_density(alpha=0.25) +
  scale_x_continuous(expand=c(0.01,0.01)) + 
  theme_light() + 
  theme.text.size + 
  labs(x="Year", y="Posterior distribution density\n(start of 2-fold population decline/expansion)", color="Key") + 
  guides(fill=FALSE) +
  geom_text(data=beast.decline.increase.support.values, aes(x=median.date, y=0.45, label=paste0(proportion,"% of supporting trees")),size=3) + 
  geom_text(data=beast.decline.increase.support.values, aes(x=median.date, y=0.5, label=paste0("median date: ",round(median.date,1))),size=3) + 
  theme(legend.position = 'top') 
  #geom_vline(data=beast.decline.increase.support.values, aes(xintercept=median.date, color=query)) +
  NULL



#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure8__Skyline-pop-decline+expansion-dates__02-2021.svg"), width = 450, height = 300,type="svg",units = "pt")  
p.beast2.full.popdecline.increase.densiplot
#dev.off()



```

Look at trees supporting or not-supporting expansion
```{r}
pop.decline.supporting.trees <- read.nexus(pop.decline.supporting.trees.file)
pop.decline.supporting.trees.sample25 <-sample(pop.decline.supporting.trees,size=25)

pop.decline.notsupporting.trees <- read.nexus(pop.decline.notsupporting.trees.file)
pop.decline.notsupporting.trees.sample25 <-sample(pop.decline.notsupporting.trees,size=25)

TPA.meta1.2.pinecone$Sample_Name.dates <- paste0(TPA.meta1.2.pinecone$Sample_Name,"|",TPA.meta1.2.pinecone$Sample_Year)


p.pop.decline.notsupporting.trees.sample25 <- ggdensitree(pop.decline.notsupporting.trees.sample25, alpha=0.1,mrsd="2019-06-01") + 
  theme_tree2()
p.pop.decline.notsupporting.trees.sample25 <- p.pop.decline.notsupporting.trees.sample25 %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name.dates, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage) + 
  geom_tippoint(aes(color=Sublineage), size=1, alpha=0.5) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  # Add date lines for easy interpretation  
  scale_x_continuous(breaks=c(1300,1400,1500,1600,1700,1800,1850,1900,1925,1950,1975,2000,2020), minor_breaks=seq(1950, 2020, 5)) +
  theme(panel.grid.major   = element_line(color="grey50", size=.2),
        panel.grid.minor   = element_line(color="grey85", size=.2),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) + 
  ggtitle("Trees not supporting population expansion") +
  theme(plot.title = element_text(size = 9))
#p.pop.decline.notsupporting.trees.sample25


p.pop.decline.supporting.trees.sample25 <- ggdensitree(pop.decline.supporting.trees.sample25, alpha=0.1,mrsd="2019-06-01") + 
  theme_tree2()
p.pop.decline.supporting.trees.sample25 <- p.pop.decline.supporting.trees.sample25 %<+% data.frame(Sample_Name=TPA.meta1.2.pinecone$Sample_Name.dates, Sublineage=TPA.meta1.2.pinecone$TPA.pinecone.sublineage) + 
  geom_tippoint(aes(color=Sublineage), size=1, alpha=0.5) + 
  scale_color_manual(name="Sublineage",values=sublineages.cols.brew$sublineage.cols, breaks=sublineages.cols.brew$sublineage) +
  # Add date lines for easy interpretation  
  scale_x_continuous(breaks=c(1300,1400,1500,1600,1700,1800,1850,1900,1925,1950,1975,2000,2020), minor_breaks=seq(1950, 2020, 5)) +
  theme(panel.grid.major   = element_line(color="grey50", size=.2),
        panel.grid.minor   = element_line(color="grey85", size=.2),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) +
  ggtitle("Trees supporting population expansion") +
  theme(plot.title = element_text(size = 9))
#p.pop.decline.supporting.trees.sample25



plot_grid(p.pop.decline.supporting.trees.sample25, p.pop.decline.notsupporting.trees.sample25, ncol=1)
```
There are no obvious differences in tree topology here. It is likely that these trees either show expansion <2-fold, or the baseline used to average is affecting expansion - e.g. if the baseline was higher at the start of the decline, expansion would not be detected. 




\
Extract individual sublineages for Bayesian Skyline analysis (only take sublineages of decent size, with no outgroups)\

```{r}
major.multicountry.sublineages <- sublineage.counts[sublineage.counts$Count>10,]
major.multicountry.sublineages
sublineage.1.meta <- TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage==1,c("Cleaned_fastq_id","Sample_Name","Sample_Year")]
sublineage.2.meta <- TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage==2,c("Cleaned_fastq_id","Sample_Name","Sample_Year")]
#sublineage.4.meta <- TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage==4,c("Cleaned_fastq_id","Sample_Name","Sample_Year")]
sublineage.8.meta <- TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage==8,c("Cleaned_fastq_id","Sample_Name","Sample_Year")]
sublineage.14.meta <- TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$TPA.pinecone.sublineage==14,c("Cleaned_fastq_id","Sample_Name","Sample_Year")]

all.pinecone.meta <- TPA.meta1.2.pinecone.havedates[,c("Cleaned_fastq_id","Sample_Name","Sample_Year")]
```


Check temporal signal by extracting subtrees
```{r, fig.width=12, fig.height=12, message=FALSE, warning=FALSE}

p.sublin.1.r2t <- plot.subtree.R2T(TPA.MLtree,sublineage.1.meta$Sample_Name,TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])
p.sublin.2.r2t <- plot.subtree.R2T(TPA.MLtree,sublineage.2.meta$Sample_Name,TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])
p.sublin.8.r2t <- plot.subtree.R2T(TPA.MLtree,sublineage.8.meta$Sample_Name,TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])
p.sublin.14.r2t <- plot.subtree.R2T(TPA.MLtree,sublineage.14.meta$Sample_Name,TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])



#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure10_Sublineages_trees+root2tip__02-2021.svg"), width = 1000, height = 1000,type="svg",units = "pt")
plot_grid(p.sublin.1.r2t,p.sublin.2.r2t,p.sublin.8.r2t,p.sublin.14.r2t,ncol=2, labels=c('Sublineage 1','Sublineage 2','Sublineage 8','Sublineage 14'),scale = 0.85,label_size=12)
#dev.off()

```

# Evaluate subtree skyline analyses \

Sublineage 1 (new)\
Seqs 365, sites 278, span 1981-2019\
- Good convergence in all traces, strong ESS for most variables, but lower for a few of the skyline groups (ESS>100, apart from Skyline.Groupsize10 with ESS 79) \
- clock rate 1.34e-7  \

Sublineage 2 (new)\
Seqs 32, sites 36, span 2000-2019\
- Good convergence in all traces, very strong ESS in all cases\
- Skyline strong expanding skyline plot \
- clock rate 1.57e-7  \

Sublineage 8 (new)\
Seqs 15, sites 31, span 1986-2019\
- Good convergence in all traces, very strong ESS in all cases\
- Skyline - good skyline, showing possible expansion during 90s, possible contraction in 2010s) \
- clock rate 1.80e-7  \

Sublineage 14 (new)\
Seqs 55, sites 23, span 2013-2019\
- did not converge (at all) \
- No temporal signal (in r2t plots) \
\
\


\
\


Define some functions to plot
```{r}


plot.skyline.data <- function(skyline.file,lineages.file){  
  skyline.data <- read.table(paste0(sublineage.skylines.filepath,skyline.file), header=T, sep="\t", comment.char="",stringsAsFactors=F)
  skyline.plot <- ggplot(skyline.data, aes(Time,Median)) + 
    geom_line() + geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
    theme_light() + 
    #scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + 
    scale_x_continuous(breaks=c(seq(1980,2020,20)), expand=c(0.01,0.01)) + 
    scale_y_log10() + 
    coord_cartesian(x=c(1980,2020)) + 
    theme.text.size + labs(y="Relative genetic diversity", x="Year") + 
    geom_vline(xintercept = 2000, color='blue', alpha=0.5) + annotate("rect",xmin=2000,xmax=2020,ymin=0,ymax=10000,alpha=0.1, fill='blue')
  
  # Lineages
  skyline.lineages <- read.table(paste0(sublineage.skylines.filepath,lineages.file), header=T, sep="\t", comment.char="",stringsAsFactors=F)
  lineages.plot  <- ggplot(skyline.lineages, aes(Time,Median)) + 
    geom_line() + geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
    theme_light() + 
    scale_x_continuous(breaks=c(seq(1980,2020,20)), expand=c(0.01,0.01)) + 
    #scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + 
    scale_y_log10() + 
    coord_cartesian(x=c(1980,2020), y=c(1,300)) + 
    #scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + 
    theme.text.size + labs(y="Lineages", x="Year") + 
    geom_vline(xintercept = 2000, color='blue', alpha=0.5) + annotate("rect",xmin=2000,xmax=2020,ymin=0,ymax=500,alpha=0.1, fill='blue')
  
  skyline.combined.plot <- plot_grid(skyline.plot,lineages.plot, ncol=1, align=T)
  return(skyline.combined.plot)
  #return(skyline.plot)
}


```


# Plot sublineage skylines with lineages through time - all together
```{r, fig.width=6, fig.height=8, message=FALSE, warning=FALSE}

sublineage.skylines.filepath <- "/Users/mb29/Treponema/Expanded_Global_Sequencing/Analysis/Global_TPA_uber-analysis_2020_v2/phylo/good_cov_dataset/gubbins_2.4.1__20-11-10/beast/individual_sublineages_reclassified__2021-02-04/beauti/outputs/"
  
collected.skylines <- NULL
for (sublineage in c(1,2,8)) {
  sublineage.skyline <- read.table(paste0(sublineage.skylines.filepath,"TPA-uber.remasked.2020-11-10.goodcov25.gubbins.WGS.sublineage.new.",sublineage,".noinv.Strict-Skyline_combined.skyline-data.tsv"), header=T, sep="\t", comment.char="",stringsAsFactors=F)
  sublineage.skyline$sublineage <- sublineage 
  collected.skylines <- rbind(collected.skylines, sublineage.skyline)
}
beast2.full.skyline$sublineage <- "All"
collected.skylines.inc.full <- rbind(collected.skylines, beast2.full.skyline)
collected.skylines.inc.full$sublineage <- factor(collected.skylines.inc.full$sublineage, levels=unique(collected.skylines.inc.full$sublineage))

sublineage.skyline.combi.plot <- ggplot(collected.skylines.inc.full, aes(Time,Median)) + 
  geom_line() + geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
  theme_light() + 
  #scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + 
  scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + 
  scale_y_log10() + 
  coord_cartesian(x=c(1900,2020),y=c(1,3000)) + 
  theme.text.size + labs(y="Relative genetic diversity (per sublineage)", x="Year") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) + #annotate("rect",xmin=2000,xmax=2020,ymin=0,ymax=10000,alpha=0.1, fill='blue') + 
  facet_grid(sublineage~.) +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.y=element_text(color = "grey25",angle=0, size=10))


# Do same for 'lineages through time'
collected.skylines.lineages <- NULL
for (sublineage in c(1,2,8)) {
  sublineage.skyline.lineages <- read.table(paste0(sublineage.skylines.filepath,"TPA-uber.remasked.2020-11-10.goodcov25.gubbins.WGS.sublineage.new.",sublineage,".noinv.Strict-Skyline_combined.lineages-data.tsv"), header=T, sep="\t", comment.char="",stringsAsFactors=F)
  sublineage.skyline.lineages$sublineage <- sublineage 
  collected.skylines.lineages <- rbind(collected.skylines.lineages, sublineage.skyline.lineages)
}
beast2.full.lineages$sublineage <- "All"
collected.skylines.lineages.inc.full <- rbind(collected.skylines.lineages, beast2.full.lineages)
collected.skylines.lineages.inc.full$sublineage <- factor(collected.skylines.lineages.inc.full$sublineage, levels=unique(collected.skylines.lineages.inc.full$sublineage))


sublineage.skyline.lineages.combi.plot <- ggplot(collected.skylines.lineages.inc.full, aes(Time,Median)) + 
  geom_line() + geom_ribbon(aes(ymin=Lower, ymax=Upper), alpha=0.2, fill='black') +
  theme_light() + 
  scale_x_continuous(breaks=c(seq(1980,2020,10)), expand=c(0.01,0.01)) + 
  scale_y_log10() + 
  coord_cartesian(x=c(1980,2020)) + 
  theme.text.size + labs(y="Inferred Lineages (per sublineage)", x="Year") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) + #annotate("rect",xmin=2000,xmax=2020,ymin=0,ymax=10000,alpha=0.1, fill='blue') + 
  facet_grid(sublineage~.) +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.y=element_text(color = "grey25",angle=0, size=10))


#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure9_sublineages_skylines__02-2021.svg"), width = 300, height = 400,type="svg",units = "pt")
sublineage.skyline.combi.plot
#dev.off()
```




Look at Chris Ruis's tool for extracting the date distributino of a population size increase\
Run as: \
`python3 ~/scripts/population_increase_distribution_BEAST.py -b1 -t TPA-uber.remasked.2020-11-10.goodcov25.gubbins.WGS.sublineage.12.Strict-Skyline_combined.trees -l TPA-uber.remasked.2020-11-10.goodcov25.gubbins.WGS.sublineage.12.Strict-Skyline_combined.log -d 2019.5 -o TPA-uber.remasked.2020-11-10.goodcov25.gubbins.WGS.sublineage.12.Strict-Skyline_combined.pop-distributions.txt`
\

All pop distros
```{r}
# "TPA-uber_beast2_strict-skyline-500M_10pop_combined.pop-distributions_p100.txt"
pop.distro.subsampled.all.file <- beast2.full.popdistro.path 


# Do same for 'lineages through time'
collected.skyline.popdistros <- NULL
for (sublineage in c(1,2,8)) {
  skyline.popdistro <- read.table(paste0(pop.distro.path,"TPA-uber.remasked.2020-11-10.goodcov25.gubbins.WGS.sublineage.new.",sublineage,".noinv.Strict-Skyline_combined.pop-expansion"), header=T, sep="\t", comment.char="",stringsAsFactors=F)
  skyline.popdistro$sublineage <- sublineage 
  collected.skyline.popdistros <- rbind(collected.skyline.popdistros, skyline.popdistro)
}
# do 'All' separately
sublineage.all.skyline.pop.distro <- read.table(paste0(pop.distro.path,pop.distro.subsampled.all.file), header=T, sep="\t", comment.char="",stringsAsFactors=F)
sublineage.all.skyline.pop.distro$sublineage <- "All"
collected.skyline.popdistros <- rbind(collected.skyline.popdistros,sublineage.all.skyline.pop.distro)


collected.skyline.popdistros$sublineage <- factor(collected.skyline.popdistros$sublineage, levels=unique(collected.skyline.popdistros$sublineage))

x.theme.strip.partial <- theme(axis.text.x = element_blank(), axis.ticks.x= element_blank())


plot.collected.skyline.popdistros.starts <- ggplot(collected.skyline.popdistros[collected.skyline.popdistros$Increase_date>0,], aes(sublineage,Increase_date)) +
  geom_sina(alpha=0.2,color="grey80", size=1) + 
  geom_boxplot(alpha=0.0,outlier.shape = NA, width=0.25) +
  coord_cartesian(ylim=c(1900,2020)) + 
  theme_light() + 
  ylim(1940,2020) +
  facet_grid(sublineage~., scales="free_y") +
  coord_flip() + theme.text.size +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.y=element_text(color = "grey25",angle=0, size=10)) +
  #y.theme.strip +
  theme(axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
  labs(y="Year", x="Distribution density (start date for 5-fold population expansion)") + 
  geom_hline(yintercept = 2000, color='blue', alpha=0.5) +
  NULL
#plot.collected.skyline.popdistros.starts


plot.collected.skyline.popdistros.starts.density <- ggplot(collected.skyline.popdistros[collected.skyline.popdistros$Increase_date>0,], aes(Increase_date)) + 
  geom_density() +
  coord_cartesian(xlim=c(1900,2020)) +
  #scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01), limits=c(1740,2020)) + 
  scale_x_continuous(breaks=c(seq(1740,2020,20)), expand=c(0.01,0.01)) + 
  facet_grid(sublineage~., scales="free_y") +
  #facet_grid(sublineage~.) +
  theme_light() + 
  theme.text.size + 
  #theme(axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.y=element_text(color = "grey25",angle=0, size=10)) +
  labs(x="Year", y="Distribution density (start of 2-fold population expansion)") + 
  geom_vline(xintercept = 2000, color='blue', alpha=0.5) +
  NULL
plot.collected.skyline.popdistros.starts.density

```


```{r, fig.width=12, fig.height=10, message=FALSE, warning=FALSE}

plot_grid(sublineage.skyline.combi.plot,  sublineage.skyline.lineages.combi.plot, plot.collected.skyline.popdistros.starts.density,ncol=3, align=T, axis="ltb",rel_widths=c(4,2,3), labels=c('A - Genetic Diversity','B - Lineages','C - Population Expansion'), label_size=11,label_y=1, label_x=0.01, scale=0.95) 

```

\


#Want to look at distributions within single countries

Extract UK and Canada (all) subtrees
```{r}

subtree.UK.meta <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Geo_Country=="UK"),c("Cleaned_fastq_id","Sample_Name","Sample_Year")]
p.subtree.UK.r2t <- plot.subtree.R2T(TPA.MLtree,subtree.UK.meta$Sample_Name,TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])

subtree.BC.meta <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Geo_Region=="British_Columbia"),c("Cleaned_fastq_id","Sample_Name","Sample_Year")]
p.subtree.BC.r2t <- plot.subtree.R2T(TPA.MLtree,subtree.BC.meta$Sample_Name,TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])

plot_grid(p.subtree.UK.r2t, p.subtree.BC.r2t, nrow=2)
```

Extract UK and Canada subtrees for Sublineage 2 only
```{r}

sublineage.2.UK.meta <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$TPA.pinecone.sublineage==2 & TPA.meta1.2.pinecone$Geo_Country=="UK"),c("Cleaned_fastq_id","Sample_Name","Sample_Year")]
p.sublin.2.UK.r2t <- plot.subtree.R2T(TPA.MLtree,sublineage.2.UK.meta$Sample_Name,TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])

sublineage.2.BC.meta <- TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$TPA.pinecone.sublineage==2 & TPA.meta1.2.pinecone$Geo_Region=="British_Columbia"),c("Cleaned_fastq_id","Sample_Name","Sample_Year")]
p.sublin.2.BC.r2t <- plot.subtree.R2T(TPA.MLtree,sublineage.2.BC.meta$Sample_Name,TPA.meta1.2.pinecone.havedates[,c("Sample_Name","Sample_Year")])

plot_grid(p.sublin.2.UK.r2t, p.sublin.2.BC.r2t, nrow=2)

```

So very little temporal signal here, particularly for the UK data (only 7 years). Not realistic to analyse this way.




Look at coverage statistics 
```{r}
# For the large low cov dataset
nrow(TPA.meta1.2)
mean(as.numeric(TPA.meta1.2$Mean_mapping_coverage), na.rm=T)
max(as.numeric(TPA.meta1.2$Mean_mapping_coverage))
min(as.numeric(TPA.meta1.2$Mean_mapping_coverage))

(1-mean(as.numeric(TPA.meta1.2$`Proportion-N_>5_mapping+masking_Nichols`), na.rm=T))*100
(1-max(as.numeric(TPA.meta1.2$`Proportion-N_>5_mapping+masking_Nichols`), na.rm=T))*100
(1-min(as.numeric(TPA.meta1.2$`Proportion-N_>5_mapping+masking_Nichols`), na.rm=T))*100


# For the curated dataset
nrow(TPA.meta1.2.pinecone)
mean(as.numeric(TPA.meta1.2.pinecone$Mean_mapping_coverage), na.rm=T)
max(as.numeric(TPA.meta1.2.pinecone$Mean_mapping_coverage))
min(as.numeric(TPA.meta1.2.pinecone$Mean_mapping_coverage))

(1-mean(as.numeric(TPA.meta1.2.pinecone$`Proportion-N_>5_mapping+masking_Nichols`), na.rm=T))*100
(1-max(as.numeric(TPA.meta1.2.pinecone$`Proportion-N_>5_mapping+masking_Nichols`), na.rm=T))*100
```


```{r}
(1-min(as.numeric(TPA.meta1.2.pinecone$`Proportion-N_>5_mapping+masking_Nichols`), na.rm=T))*100
```



Quickly look at lineage association of recombination blocks
```{r}
# Define and read in recombination info
recombination_event.data <- readxl::read_excel(recombination_event.file,sheet="Recombination_Data")

# Subset to gubbins filtered regions and order by event
recombination_event.data.gubbins <- recombination_event.data[recombination_event.data$Gubbins_Event_ID!="-",]
recombination_event.data.gubbins <- recombination_event.data.gubbins[!is.na(recombination_event.data.gubbins$line_order),]
recombination_event.data.gubbins <- recombination_event.data.gubbins[order(as.numeric(recombination_event.data.gubbins$Gubbins_Event_ID)),]

# convert per-event list into a matrix
recombination_event.data.gubbins.matrix <- reshape2::dcast(reshape2::melt(strsplit(recombination_event.data.gubbins$SampleIDs,",")),L1~value)
colnames(recombination_event.data.gubbins.matrix)[1] <- "Gubbins_Event_ID"

# melt matrix into longform
recombination_event.data.gubbins.matrix.melt <- reshape2::melt(recombination_event.data.gubbins.matrix,id.vars="Gubbins_Event_ID")
# infer presence/absence
recombination_event.data.gubbins.matrix.melt$binary <- ifelse(is.na(recombination_event.data.gubbins.matrix.melt$value),0,1)
colnames(recombination_event.data.gubbins.matrix.melt) <- c("Gubbins_Event_ID","Sample_Name","value","recomb.present")

# Make a binary presence/absence matrix organised by sample (for possible plotting with ggtree)
recombination_event.data.gubbins.matrix.binary <- reshape2::dcast(recombination_event.data.gubbins.matrix.melt[,c("Gubbins_Event_ID","Sample_Name","recomb.present")], Sample_Name~Gubbins_Event_ID)


# Merge in Lineage and Pinecone information
recombination_event.data.gubbins.matrix.melt <- plyr::join(recombination_event.data.gubbins.matrix.melt[c("Gubbins_Event_ID","Sample_Name","recomb.present")],TPA.meta1.2.pinecone[,c("Sample_Name","TPA.pinecone.sublineage","TPA_Lineage")], by="Sample_Name")


recombination_event.by.Lineage <- recombination_event.data.gubbins.matrix.melt %>% dplyr::group_by(Gubbins_Event_ID, TPA_Lineage) %>% 
  dplyr::summarise(Sum=sum(recomb.present))
#recombination_event.by.Lineage[recombination_event.by.Lineage$Sum!=0,]

recombination_event.by.sublineage <- recombination_event.data.gubbins.matrix.melt %>% dplyr::group_by(Gubbins_Event_ID, TPA.pinecone.sublineage) %>% 
  dplyr::summarise(Sum=sum(recomb.present))
recombination_event.by.sublineage[recombination_event.by.sublineage$Sum!=0,]


```

```{r}
recombination_event.data.gubbins.matrix.melt <- plyr::join(recombination_event.data.gubbins.matrix.melt, recombination_event.data.gubbins[,c("Gubbins_Event_ID","Block_Start","Block_End")], by="Gubbins_Event_ID")

recombiplot.tip.order <- data.frame(Sample_Name=get_taxa_name(ggtree(TPA.MLtree)),stringsAsFactors = F) 
recombiplot.tip.order$order <- c(1:nrow(recombiplot.tip.order))
recombiplot.tip.order$order2 <- rev(c(1:nrow(recombiplot.tip.order)))
recombiplot.tip.order <- plyr::join(recombiplot.tip.order,recombination_event.data.gubbins.matrix.melt, by="Sample_Name", type="full")


p.recombi.plot <- ggplot(recombiplot.tip.order) +
  geom_rect(aes(ymin=order2-0.5,ymax=order2+0.5, xmin=as.numeric(Block_Start), xmax=as.numeric(Block_End)), alpha=0.5) +
  coord_cartesian(xlim=c(1,1139569)) +
  theme_minimal()

#TPA.MLtree.ggtree.tippoint


#ggtree(TPA.MLtree) + theme(legend.position="none")
plot_grid(ggtree(TPA.MLtree), p.recombi.plot, align=T, axis="tb", rel_widths = c(1,2))
```

```{r}

TPA.MLtree.testplot <- ggtree(TPA.MLtree)
#facet_plot(TPA.MLtree.testplot, panel="Genome_Blocks", data=recombiplot.tip.order, geom_rect, mapping=aes(xmin=Block_Start, xmax=Block_End, fill=TPA_Lineage))

```

```{r}
recombining.genes <- c("TPASS_RS00040", "TPASS_RS00045", "TPASS_RS00590", "TPASS_RS00675", "TPASS_RS01555", "TPASS_RS01565", "TPASS_RS01570", "TPASS_RS02125", "TPASS_RS02290", "TPASS_RS02700", "TPASS_RS03020", "TPASS_RS03070", "TPASS_RS03075", "TPASS_RS04240", "TPASS_RS04245", "TPASS_RS04250", "TPASS_RS04275", "TPASS_RS05385", "TPASS_RS05110", "TPASS_RS05210,TPASS_RS00685,TPASS_RS00705,TPASS_RS00690,TPASS_RS00680,TPASS_RS00700,TPASS_RS00675,TPASS_RS00665,TPASS_RS00670", "TPASS_RS00705,TPASS_RS00685,TPASS_RS00690,TPASS_RS00700", "TPASS_RS00705", "TPASS_RS00705", "TPASS_RS00705", "TPASS_RS00905", "TPASS_RS01600", "TPASS_RS01600", "TPASS_RS02265", "TPASS_RS02265", "TPASS_RS02265", "TPASS_RS02350,TPASS_RS02355", "TPASS_RS02375", "TPASS_RS02525", "TPASS_RS02760,TPASS_RS02755", "TPASS_RS03055,TPASS_RS03065,TPASS_RS03060", "TPASS_RS03065", "TPASS_RS04780,TPASS_RS04790,TPASS_RS04785,TPASS_RS04775", "TPASS_RS05100,TPASS_RS05105")


recombining.genes <- c("TPASS_RS05210,TPASS_RS00685,TPASS_RS00705,TPASS_RS00690,TPASS_RS00680,TPASS_RS00700,TPASS_RS00675,TPASS_RS00665,TPASS_RS00670", "TPASS_RS00705,TPASS_RS00685,TPASS_RS00690,TPASS_RS00700", "TPASS_RS00705", "TPASS_RS00705", "TPASS_RS00705", "TPASS_RS00905", "TPASS_RS01600", "TPASS_RS01600", "TPASS_RS02265", "TPASS_RS02265", "TPASS_RS02265", "TPASS_RS02350,TPASS_RS02355", "TPASS_RS02375", "TPASS_RS02525", "TPASS_RS02760,TPASS_RS02755", "TPASS_RS03055,TPASS_RS03065,TPASS_RS03060", "TPASS_RS03065", "TPASS_RS04780,TPASS_RS04790,TPASS_RS04785,TPASS_RS04775", "TPASS_RS05100,TPASS_RS05105")

unique(unlist(strsplit(recombining.genes,",")))


```


Tip date randomisation using TipDatingBEAST package
```{r}
library(TipDatingBeast)

# setwd("/Users/mb29/Treponema/Expanded_Global_Sequencing/Analysis/Global_TPA_uber-analysis_2020_v2/phylo/good_cov_dataset/gubbins_2.4.1__20-11-10/beast/beast2_full-tree_2020-11-25/beauti/tip_randomisation/")

#setwd("/Users/mb29/Treponema/Expanded_Global_Sequencing/Analysis/Global_TPA_uber-analysis_2020_v2/phylo/good_cov_dataset/gubbins_2.4.1__20-11-10/beast/beast2_full-tree_2020-11-25/beauti/tip_randomisation/rerun_MEL-1_01-03-2021/")

original.xml.file <- "beast2_Strict-Skyline_Full500M_+sites_1"

# Generate tipdate-randomised xml files from original BEAST2 xml
#TipDatingBeast::RandomDates(name="beast2_Strict-Skyline_Full500M_+sites_1",reps=20)

# All datasets run in BEAST2. 

# After 20 runs (took ~2 weeks) ensure all files are correctly labelled and pull in data
#TipDatingBeast::PlotDRT(name="beast2_Strict-Skyline_Full500M_+sites_1",reps=20,burnin=0.1)

# That plot isn't very nice, but the package also generates a csv file containing the values of interest that we can plot

randomtip.summary <- read.csv(random.tip.summary.file,header=T)

randomtip.summary$Data <- ifelse(randomtip.summary$calibr==0,"Real","Randomised")

p.tipdaterandomisation.normal <- ggplot(randomtip.summary) +
  geom_pointrange(aes(x=calibr, y=median,ymin = lowerHPD, ymax = HigherHPD,color=Data)) +
  theme_light() + theme(legend.position="top") +
  labs(x="Replicate", y="Clock Rate") +
  geom_hline(yintercept=randomtip.summary$lowerHPD[1],alpha=0.5) +
  geom_hline(yintercept=randomtip.summary$HigherHPD[1],alpha=0.5) +
  geom_hline(yintercept=randomtip.summary$HigherHPD[18],alpha=0.5) +
  NULL
#p.tipdaterandomisation.normal

p.tipdaterandomisation.log10 <- ggplot(randomtip.summary) +
  geom_pointrange(aes(x=calibr, y=median,ymin = lowerHPD, ymax = HigherHPD,color=Data)) +
  theme_light() + 
  theme(legend.position="top") +
  #scale_y_log10() +
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
              labels = trans_format("log10", math_format(10^.x))) +
  labs(x="Replicate", y="Clock Rate") +
  geom_hline(yintercept=randomtip.summary$lowerHPD[1],alpha=0.5) +
  geom_hline(yintercept=randomtip.summary$HigherHPD[1],alpha=0.5) +
  NULL

#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure19_tipdate-randomisation.svg"), width = 550, height = 300,type="svg",units = "pt")
p.tipdaterandomisation.log10
#dev.off()

randomtip.summary[randomtip.summary$calibr==0,"median"]
max(randomtip.summary[randomtip.summary$calibr!=0,"median"])
```


Distribution of Rabbit passaged strains
```{r}

sublineage.passaged.samples <- data.frame(TPA.meta1.2.pinecone %>% dplyr::group_by(TPA.pinecone.sublineage,Direct_from_clin) %>%
  dplyr::summarise(count=n()),stringsAsFactors = F)

sublineage.passaged.samples$TPA.pinecone.sublineage <- factor(sublineage.passaged.samples$TPA.pinecone.sublineage, levels=rev(sublineages.cols.brew$sublineage))

p.sublineage.rabbit.passage <- ggplot(sublineage.passaged.samples, aes(count, TPA.pinecone.sublineage, fill=Direct_from_clin)) +
  geom_barh(stat="identity",position="fill", width=0.75) +
  theme_light() +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='bottom') +
  scale_fill_manual(name="Direct from Clinical sample\n(no rabbit passage)",values=c("grey80","grey10")) +
  labs(y="Sublineage", x="Proportion passaged")

#p.sublineage.rabbit.passage


#data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, Direct_Sequenced=TPA.meta1.2.pinecone$Direct_from_clin, stringsAsFactors=F)

p.MLtree.rabbit.passage.distros <- gheatmap(TPA.MLtree.ggtree.tippoint + theme(legend.position="none"),
               data.frame(row.names=TPA.meta1.2.pinecone$Sample_Name, `Direct from clinical`=TPA.meta1.2.pinecone$Direct_from_clin, stringsAsFactors=F), color='grey70',width=0.075,offset=0.00000725, colnames_angle=-45,colnames_offset_y=0, hjust=0,font.size=2) + 
  scale_fill_manual(name="Direct from clinical sample",values=c("grey80","grey10")) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='right')

p.sublineage.rabbit.passage.toprow <- plot_grid('',p.sublineage.rabbit.passage + theme(legend.position='none'),'',ncol=1,rel_heights=c(1,3,1))

p.rabbit.passage.distros.combination <- plot_grid(p.MLtree.rabbit.passage.distros, p.sublineage.rabbit.passage.toprow, rel_widths = c(3,1), labels=c('A','B'), label_size=11)


#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure7__MLtree+rabbitpassage_02-2021.svg"), width = 1000, height = 800,type="svg",units = "pt")
p.rabbit.passage.distros.combination
#dev.off()

```





\
\
\
\
Look at distribution of SNPs on genome\

```{r}


WGS.site.positions <- read.table(WGS.site.positions.file, stringsAsFactors = F, header=F)
colnames(WGS.site.positions) <- "position"

ggplot(WGS.site.positions, aes(position)) + 
  geom_density() +
  theme_light()

WGS.site.positions$SNP <- 1
WGS.site.positions.all <- plyr::join(WGS.site.positions, data.frame(position=c(1:1139569),stringsAsFactors=F), type="right", by="position")
WGS.site.positions.all[is.na(WGS.site.positions.all$SNP),"SNP"] <- 0


windowsize <- 1000
WGS.site.positions.all$window <- ((trunc(as.numeric(WGS.site.positions.all$position) / windowsize,0))*windowsize)
WGS.SNP.density.window <- WGS.site.positions.all %>% 
  group_by(window) %>% 
  dplyr::summarise(mean = mean(SNP), count=sum(SNP))

p.WGS.SNP.density <- ggplot(WGS.SNP.density.window, aes(window, count)) + 
  geom_point(alpha=0.5) +
  theme_light() + 
  labs(x=paste0("Genome Position (",windowsize," bp windows)"), y=paste0("Variable sites/",windowsize," bp")) +
  theme.text.size
p.WGS.SNP.density




```



# Bring macrolide resistance back in

Look at macrolide resistance
```{r}


TPA.global.compmapping.23s <- read.table(TPA.global.compmapping.23s.file, header=T, sep="\t", check.names = F, comment.char = "")


# Missing tree samples from 23s data
missing.23S <- TPA.MLtree$tip.label[TPA.MLtree$tip.label %notin% TPA.global.compmapping.23s$Sample] 
missing.23S.meta <- TPA.meta1.2.pinecone[TPA.meta1.2.pinecone$Sample_Name %in% missing.23S, "Cleaned_fastq_id"]

# Only keep relevant values
TPA.global.compmapping.23s <- TPA.global.compmapping.23s[TPA.global.compmapping.23s$Sample %in% TPA.MLtree$tip.label,]


TPA.global.compmapping.23s$Sample_Name <- TPA.global.compmapping.23s$Sample

# Evaluate alleles (again)
TPA.global.compmapping.23s$VariantPresent_A2058G_redo <- ifelse((TPA.global.compmapping.23s$ALT_A2058G=="G" & TPA.global.compmapping.23s$DP_A2058G>20 & TPA.global.compmapping.23s$AltPerc_A2058G>95),"Yes", ifelse(TPA.global.compmapping.23s$VariantPresent_A2058G=="Hetero","Uncertain",ifelse((TPA.global.compmapping.23s$ALT_A2058G=="G" & TPA.global.compmapping.23s$DP_A2058G<=20 & TPA.global.compmapping.23s$AltPerc_A2058G>95),"Uncertain","No")))

TPA.global.compmapping.23s$VariantPresent_A2059G_redo <- ifelse((TPA.global.compmapping.23s$ALT_A2059G=="G" & TPA.global.compmapping.23s$DP_A2059G>20 & TPA.global.compmapping.23s$AltPerc_A2059G>95),"Yes", ifelse(TPA.global.compmapping.23s$VariantPresent_A2059G=="Hetero","Uncertain",ifelse((TPA.global.compmapping.23s$ALT_A2059G=="G" & TPA.global.compmapping.23s$DP_A2059G<=20 & TPA.global.compmapping.23s$AltPerc_A2058G>95),"Uncertain","No")))


TPA.global.compmapping.23s$resistant <- ifelse((TPA.global.compmapping.23s$VariantPresent_A2058G_redo=="Hetero" | TPA.global.compmapping.23s$VariantPresent_A2059G_redo=="Uncertain"),"Uncertain",ifelse(TPA.global.compmapping.23s$VariantPresent_A2058G_redo=="Yes" & TPA.global.compmapping.23s$VariantPresent_A2059G_redo=="No", "A2058G", ifelse(TPA.global.compmapping.23s$VariantPresent_A2058G_redo=="No" & TPA.global.compmapping.23s$VariantPresent_A2059G_redo=="Yes","A2059G", ifelse(TPA.global.compmapping.23s$VariantPresent_A2058G_redo=="No" & TPA.global.compmapping.23s$VariantPresent_A2059G_redo=="No", "Sensitive","Uncertain"))))

```

Now plot in a nice tree
```{r, fig.width=8, fig.height=8, message=FALSE, warning=FALSE}

TPA.global.compmapping.23s.p <- data.frame(row.names=TPA.global.compmapping.23s$Sample, A2058G=TPA.global.compmapping.23s$VariantPresent_A2058G_redo, A2059G=TPA.global.compmapping.23s$VariantPresent_A2059G_redo)

p.MLtree.23S.distros <- gheatmap(TPA.MLtree.ggtree.tippoint + theme(legend.position="none"),
               TPA.global.compmapping.23s.p, color='grey70',width=0.075,offset=0.00000725, colnames_angle=-45,colnames_offset_y=0, hjust=0,font.size=2) + 
  #scale_fill_manual(name="Resistance\nAllele\nPresent",values=c("grey50","grey95","black"), breaks=c("Uncertain","No","Yes")) +
  scale_fill_manual(name="Resistance\nAllele\nPresent",values=c("black","grey95","grey50"), breaks=c("Yes","No","Uncertain")) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='left')

p.MLtree.23S.distros
```




Look at sublineage distribution
```{r, fig.width=6, fig.height=6, message=FALSE, warning=FALSE}
TPA.meta1.2.pinecone.23S <- plyr::join(TPA.meta1.2.pinecone,data.frame(Sample_Name=TPA.global.compmapping.23s$Sample, A2058G=TPA.global.compmapping.23s$VariantPresent_A2058G_redo, A2059G=TPA.global.compmapping.23s$VariantPresent_A2059G_redo), by="Sample_Name")


TPA.meta1.2.pinecone.23S.counts <- data.frame(TPA.meta1.2.pinecone.23S %>% 
                                                dplyr::group_by(TPA.pinecone.sublineage, A2058G, A2059G) %>%
                                                dplyr::summarise(Count=n()),stringsAsFactors = F)
TPA.meta1.2.pinecone.23S.counts <- reshape2::melt(TPA.meta1.2.pinecone.23S.counts,id.vars=c("TPA.pinecone.sublineage","Count"))


TPA.meta1.2.pinecone.23S.counts$TPA.pinecone.sublineage <- factor(TPA.meta1.2.pinecone.23S.counts$TPA.pinecone.sublineage, levels=rev(sublineages.cols.brew$sublineage))

#TPA.meta1.2.pinecone.23S.counts <- TPA.meta1.2.pinecone.23S.counts[TPA.meta1.2.pinecone.23S.counts$TPA.pinecone.sublineage!="Singleton",]

TPA.meta1.2.pinecone.23S.counts$value <- ifelse(TPA.meta1.2.pinecone.23S.counts$value=="Hetero", "Uncertain",TPA.meta1.2.pinecone.23S.counts$value)

# Plot SNPs by sublineage
p.sublineage.23S.compmap <- ggplot(TPA.meta1.2.pinecone.23S.counts, aes(Count, TPA.pinecone.sublineage, fill=value, color=NULL)) +
  geom_barh(stat="identity", position="fill",width=0.75) + 
  facet_grid(.~variable) + 
  theme_light() +
  scale_fill_manual(values=c("black","grey90","grey50"), breaks=c("Yes","No","Uncertain")) +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.x=element_text(color = "grey25",angle=0, size=10)) +
  labs(x="Proportion of samples",y="Sublineage", fill="Resistance\nallele\npresent") +
  theme.text.size + theme(legend.key.size = unit(0.75,"line"),legend.position='bottom') 
#p.sublineage.23S.compmap



# Want to include Singletons for this analyis  - Redo country sublineage distros
sublineage.country.counts.incSing <- plyr::join(TPA.meta1.2.pinecone[(TPA.meta1.2.pinecone$Sample_Year!="-"),c("Sample_Name","TPA.pinecone.sublineage")], sublineage.classification, by="TPA.pinecone.sublineage",type="left")
sublineage.country.counts.incSing[is.na(sublineage.country.counts.incSing$private.distro),"private.distro"] <- "Singleton"
sublineage.country.counts.incSing <- sublineage.country.counts.incSing %>% group_by(TPA.pinecone.sublineage, private.distro) %>% summarise(Count=n())
sublineage.country.counts.incSing$TPA.pinecone.sublineage <- factor(sublineage.country.counts.incSing$TPA.pinecone.sublineage,levels=rev(sublineages.cols.brew$sublineage))

p.sublineage.private.hbarplot.incSing <- ggplot(sublineage.country.counts.incSing, aes(Count,TPA.pinecone.sublineage,fill=private.distro)) +
  geom_barh(stat="identity", position="stack", width=0.75) +
  theme_light() +
  #scale_x_log10() +
  coord_cartesian(xlim=c(0,410)) +
  theme.text.size + theme(legend.key.size = unit(0.65,"line"),legend.position='bottom') +
  scale_fill_manual(breaks=(unique(sublineage.country.counts.incSing$private.distro)), values=rev(c("grey80","grey50","grey10"))) +
  labs(y="Sublineage", x="Sample Count", fill="Sublineage\nType") +
  geom_text(data=sublineage.country.counts.incSing, aes((Count+20), TPA.pinecone.sublineage,label=Count), size=2.5, inherit.aes = F)
#p.sublineage.private.hbarplot.incSing


#plot_grid(p.sublineage.hbarplot, p.sublineage.23S.compmap + y.theme.strip, align='h', axis='tb', rel_widths =c(1,2))
p.sublineage.23S.compmap.distributions <- plot_grid(p.sublineage.private.hbarplot.incSing, p.sublineage.23S.compmap + y.theme.strip, align='h', axis='tb', rel_widths =c(1,1))


p.sublineage.23S.compmap.distributions
```

Plot distros with tree

```{r, fig.width=8, fig.height=10, message=FALSE, warning=FALSE}
plot.MLtree.23s.with.sublineage.distros.combo <- plot_grid(p.MLtree.23S.distros, p.sublineage.23S.compmap.distributions, ncol=1, rel_heights=c(3,2), labels=c('A','B'), label_size=11)

#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure14_Sublineage_vs_macrolide-distros__02-2021.svg"), width = 800, height = 1000,type="svg",units = "pt")
plot.MLtree.23s.with.sublineage.distros.combo
#dev.off()
```






Look at temporal distribution of samples
```{r, fig.width=10, fig.height=8, message=FALSE, warning=FALSE}


TPA.meta1.2.pinecone.23s.simpledates <- plyr::join(TPA.meta1.2.pinecone[,c("Sample_Name","TPA.pinecone.sublineage","Sample_Year")],TPA.global.compmapping.23s, by="Sample_Name")

TPA.meta1.2.pinecone.23s.simpledates <- data.frame(TPA.meta1.2.pinecone.23s.simpledates %>% group_by(resistant,TPA.pinecone.sublineage,Sample_Year) %>%
  summarise(Count=n()), stringsAsFactors = F)


TPA.meta1.2.pinecone.23s.simpledates <- TPA.meta1.2.pinecone.23s.simpledates[TPA.meta1.2.pinecone.23s.simpledates$TPA.pinecone.sublineage!="Singleton",]

TPA.meta1.2.pinecone.23s.simpledates <- plyr::join(data.frame(Sample_Year=c(1912:2019),stringsAsFactors=F), TPA.meta1.2.pinecone.23s.simpledates)
TPA.meta1.2.pinecone.23s.simpledates <- TPA.meta1.2.pinecone.23s.simpledates[!is.na(TPA.meta1.2.pinecone.23s.simpledates$TPA.pinecone.sublineage),]
TPA.meta1.2.pinecone.23s.simpledates <- TPA.meta1.2.pinecone.23s.simpledates[!is.na(TPA.meta1.2.pinecone.23s.simpledates$resistant),]



TPA.meta1.2.pinecone.23s.simpledates$TPA.pinecone.sublineage <- factor(TPA.meta1.2.pinecone.23s.simpledates$TPA.pinecone.sublineage, levels=(sublineages.cols.brew$sublineage))


#ggplot(TPA.meta1.2.pinecone.23s.simpledates, aes(Sample_Year, TPA.pinecone.sublineage, color=resistant, size=Count)) +
#  geom_point()

#ggplot(TPA.meta1.2.pinecone.23s.simpledates, aes(Sample_Year, TPA.pinecone.sublineage, color=resistant, size=Count)) +
#  geom_jitter()


p.bubbleplot.sublineage.resistance.alleles <- ggplot(TPA.meta1.2.pinecone.23s.simpledates, aes(Sample_Year, resistant, color=resistant)) +
  geom_point(alpha=0.65, aes(size=Count)) + 
  theme_light() +
  facet_wrap(TPA.pinecone.sublineage~.) +
  coord_cartesian(xlim=c(1970,2020)) + 
  scale_x_continuous(breaks=seq(1970,2020,20)) +
  scale_size_area(max_size = 8,breaks=c(1,5,10,25,50,75,100)) +
  theme(strip.background = element_rect(color='white', fill='white',linetype="solid"), strip.text.x=element_text(color = "grey25",angle=0, size=10)) +
  theme.text.size + theme(legend.key.size = unit(0.75,"line"),legend.position='top') +
  #scale_color_manual(values=c("black","black","grey90","grey50"), breaks=c("A2058G","A2059G","Sensitive","Uncertain"))
  scale_color_manual(name="Resistance\nAllele", values=c("red","blue","black","grey75"), breaks=c("A2058G","A2059G","Sensitive","Uncertain")) +
  geom_vline(xintercept = 2000, color='blue', alpha=0.25) +
  labs(x="Sample Year", y="Resistance Allele")
  
p.bubbleplot.sublineage.resistance.alleles



```

# Look at genetic distance v.s. geographic distance (need to infer geographic distances between samples)
### Since specific within-country gps data is limited or unavailable, will infer geographic distance between country centroids (already inferred above for map) - crude but may provide some insights. 
Plot together
```{r, fig.width=12, fig.height=8, message=FALSE, warning=FALSE}
TPA.alignment.data.dist.melt.meta2 <- plyr::join(TPA.alignment.data.dist.melt.meta,data.frame(Geo_Country.t1=country.coords.subset$Geo_Country, Long1=country.coords.subset$centroid.lon, Lat1=country.coords.subset$centroid.lat, stringsAsFactors = F), type="left", by="Geo_Country.t1")
TPA.alignment.data.dist.melt.meta2 <- plyr::join(TPA.alignment.data.dist.melt.meta2,data.frame(Geo_Country.t2=country.coords.subset$Geo_Country, Long2=country.coords.subset$centroid.lon, Lat2=country.coords.subset$centroid.lat, stringsAsFactors = F), type="left", by="Geo_Country.t2")

# Use geosphere package (distVincentyEllipsoid) to calculate geographic distance between points in km
TPA.alignment.data.dist.melt.meta2$Geographic.Distance <- geosphere::distVincentyEllipsoid(TPA.alignment.data.dist.melt.meta2[,c("Long1","Lat1")],TPA.alignment.data.dist.melt.meta2[,c("Long2","Lat2")])/1000

```


Now plot
```{r}
# Genetic Distance v.s. Geographic Distance (same Lineages)
p.geographic.vs.genetic.distance.hex.Lineage <- ggplot(TPA.alignment.data.dist.melt.meta2[TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same",], aes(Geographic.Distance,Distance)) +
#ggplot(TPA.alignment.data.dist.melt.meta2, aes(Geographic.Distance,Distance)) + 
  stat_bin_hex(colour="white", na.rm=TRUE, bins = 20) +
  scale_fill_gradientn(colours=c("purple","green"), 
                       name = "Comparison Frequency", breaks=c(3,100,3000),
                       na.value=NA, trans="log10") + 
  theme_light() +
  scale_y_continuous(breaks=seq(0,250,10)) +
  labs(y="Pairwise genetic distance (SNPs)", x="Pairwise geographic distance (kilometers)") +
  theme.text.size + theme(legend.key.size = unit(0.75,"line")) +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.x=element_text(color="grey25", size=10), strip.text.y=element_text(color="grey25", size=10)) +
  theme(legend.position="bottom") +
  #ggtitle("Pairwise SNPs (same sublineage) and Years within and between British Columbia (Canada) and England (UK)") + theme(plot.title = element_text(size = 10)) +
  #facet_grid(TPA_Lineage.t1~same.continent) +
  #facet_grid(.~same.continent) +
  facet_grid(.~TPA_Lineage.t1) +
  NULL
p.geographic.vs.genetic.distance.hex.Lineage <- p.geographic.vs.genetic.distance.hex.Lineage + stat_smooth(method='lm', fullrange=F,se=T, color='black', level=95)

p.geographic.vs.genetic.distance.hex.Lineage
```

Calculate Pearson's correlation (for real dataset)
```{r}
# For whole dataset (but only looking within same Lineage)
real.correlation1 <- cor(TPA.alignment.data.dist.melt.meta2[TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same","Distance"],TPA.alignment.data.dist.melt.meta2[TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same","Geographic.Distance"])
real.correlation1
nrow(TPA.alignment.data.dist.melt.meta2[TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same",])


# Explicitly By Lineage

# Nichols
real.correlation1.Nichols <- cor(TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta2$TPA_Lineage.t1=="Nichols"),"Distance"],TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta2$TPA_Lineage.t1=="Nichols"),"Geographic.Distance"])
real.correlation1.Nichols
nrow(TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta2$TPA_Lineage.t1=="Nichols"),])

# SS14
real.correlation1.SS14 <- cor(TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta2$TPA_Lineage.t1=="SS14"),"Distance"],TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta2$TPA_Lineage.t1=="SS14"),"Geographic.Distance"])
real.correlation1.SS14
nrow(TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta2$TPA_Lineage.t1=="SS14"),])
```



```{r}

# Define function to calculate correlations
calculate.genetic.vs.geographic.distance.correlation <- function(input.distances){
  set.seed(12345)
  bootstrap.count <- 1000
  bootstrap.correlation1 <- NULL
  # Correlation for real dataset
  real.correlation1 <- cor(input.distances[,"Distance"],input.distances[,"Geographic.Distance"])
  # Generate 1000 bootstraps and test correlation
  for (bootstrap in 1:bootstrap.count){ 
    sample1 <- data.frame(Distance=sample(input.distances[,"Distance"]), Geographic.Distance=input.distances[,"Geographic.Distance"], replace=T)
    bootstrap.correlation1 <- c(bootstrap.correlation1, cor(sample1[,1], sample1[,2]))
  }
  bootstrap.correlation1 <- data.frame(Correlation=bootstrap.correlation1, type="Bootstrap",stringsAsFactors=F)
  bootstrap.correlation1 <- rbind(data.frame(Correlation=real.correlation1, type="Real", stringsAsFactors=F),bootstrap.correlation1)
  # Calculate P value: (1+sum(s >= s0))/(N+1) - put it up against all rows for coding simplicity, but only applies to REAL data.
  bootstrap.correlation1$pval <- (1+sum(bootstrap.correlation1[bootstrap.correlation1$type=="Bootstrap","Correlation"] >= bootstrap.correlation1[bootstrap.correlation1$type=="Real","Correlation"]))/(nrow(bootstrap.correlation1[bootstrap.correlation1$type=="Bootstrap",]) +1)
  # Adjust to minimum sensitivity of method (in case of zero)
  bootstrap.correlation1$pval <- ifelse(bootstrap.correlation1$pval==0, 1/bootstrap.count,bootstrap.correlation1$pval)
  bootstrap.correlation1[bootstrap.correlation1$type=="Bootstrap","pval"] <- NA
  bootstrap.correlation1$Correlation2 <- ifelse(bootstrap.correlation1$type=="Bootstrap",NA, bootstrap.correlation1$Correlation)
  bootstrap.correlation1$dataset.length <- nrow(input.distances)
  return(bootstrap.correlation1)
}

#calculate.genetic.vs.geographic.distance.correlation(input.distances.df)
gen.geo.dist.Lineage <- calculate.genetic.vs.geographic.distance.correlation(TPA.alignment.data.dist.melt.meta2[TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same",])

ggplot(gen.geo.dist.Lineage, aes(type, Correlation, color=type)) + 
  geom_boxplot() + 
  theme_light() +
  #scale_y_log10() +
#  geom_text(aes(type, y=0.002, label=paste0("p<",round(pval[1],5)), group=type)) +
  geom_text(aes(type, y=0.002, label=paste0("Correlation=",Correlation2, group=type))) +
  #geom_text(data=gen.geo.dist.Lineage[gen.geo.dist.Lineage$type=="Real",], aes(type, y=0.002, label=paste0("p<",round(gen.geo.dist.Lineage$pval[1],5))), inherit.aes = F) +
  #geom_text(data=gen.geo.dist.Lineage[gen.geo.dist.Lineage$type=="Real",], aes(type, y=0.005, label=paste0("Correlation=",round(gen.geo.dist.Lineage$Correlation[1],5))), inherit.aes = F) +
  NULL

```

So looking at the full dataset
```{r}
# Accross whole dataset
Correlation.gen.v.geo.dist.all.samples <- calculate.genetic.vs.geographic.distance.correlation(TPA.alignment.data.dist.melt.meta2)
Correlation.gen.v.geo.dist.all.samples[1,]

# Accross whole dataset, but constrained to genetic distances within lineage
Correlation.gen.v.geo.dist.all.samples.Lineage <- calculate.genetic.vs.geographic.distance.correlation(TPA.alignment.data.dist.melt.meta2[TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same",])
Correlation.gen.v.geo.dist.all.samples.Lineage[1,]

# Explicitly By Lineage
Correlation.gen.v.geo.dist.Nichols.Lineage <- calculate.genetic.vs.geographic.distance.correlation(TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta2$TPA_Lineage.t1=="Nichols"),])
Correlation.gen.v.geo.dist.Nichols.Lineage[1,]

Correlation.gen.v.geo.dist.SS14.Lineage <- calculate.genetic.vs.geographic.distance.correlation(TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.majorlineage=="same" & TPA.alignment.data.dist.melt.meta2$TPA_Lineage.t1=="SS14"),])
Correlation.gen.v.geo.dist.SS14.Lineage[1,]


```




Now do it within major sublineages)
```{r}
my.correlation.sublin.out <- NULL
for (sublin in c(1,2,8,14)) {
  my.correlation1 <- calculate.genetic.vs.geographic.distance.correlation(TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.Pinecone.cluster=="same" & TPA.alignment.data.dist.melt.meta2$TPA.pinecone.sublineage.t1==sublin),])
  my.correlation1$sublineage <- sublin
  my.correlation.sublin.out <- rbind(my.correlation.sublin.out,my.correlation1)
}

my.correlation.sublin.out[my.correlation.sublin.out$type=="Real",]

my.correlation.sublin.out.cor.plot <- ggplot(my.correlation.sublin.out, aes(type, Correlation, color=type)) + 
  geom_boxplot() + 
  theme_light() +
  #scale_y_log10() +
  facet_wrap(vars(sublineage)) +
  #geom_text(aes(type, y=0.002, label=paste0("p<",round(pval[1],5)), group=type)) +
  geom_text(aes(type, y=0.02, label=paste0("Correlation: ",round(Correlation2,5)), group=type)) +
#  geom_text(aes(type, y=0.001, label=paste0("p<",round(my.correlation.sublin.out$pval[1],5))), inherit.aes = F) +
#  geom_text(data=my.correlation.sublin.out[my.correlation.sublin.out$type=="Real",], aes(type, y=0.005, label=paste0("Correlation=",round(my.correlation.sublin.out$Correlation[1],5))), inherit.aes = F) +
#  geom_text(data=my.correlation.sublin.out[my.correlation.sublin.out$type=="Real",], aes(type, y=0.001, label=paste0("p<",round(my.correlation.sublin.out$pval[1],5))), inherit.aes = F) +
  theme.text.size + theme(legend.key.size = unit(0.75,"line")) +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.x=element_text(color="grey25", size=10), strip.text.y=element_text(color="grey25", size=10)) +
  theme(legend.position="bottom") +
  ggtitle("Correlation of Pairwise genetic and geographical distance within major sublineages") + theme(plot.title = element_text(size = 10))

my.correlation.sublin.out.cor.plot 

```






```{r}
# Genetic Distance v.s. Geographic Distance (same sublineages)
TPA.alignment.data.dist.melt.meta2$TPA.pinecone.sublineage.t1 <- factor(TPA.alignment.data.dist.melt.meta2$TPA.pinecone.sublineage.t1, levels=sublineages.cols.brew$sublineage)


sublineage_names <- c(`1`="Sublineage 1",`2`="Sublineage 2",`8`="Sublineage 8",`14`="Sublineage 14")


p.geographic.vs.genetic.distance.hex.sublin <- ggplot(TPA.alignment.data.dist.melt.meta2[(TPA.alignment.data.dist.melt.meta2$same.TPA.Pinecone.cluster=="same" & TPA.alignment.data.dist.melt.meta2$TPA.pinecone.sublineage.t1 %in% c(1,2,8,14)),], aes(Geographic.Distance,Distance)) +
#ggplot(TPA.alignment.data.dist.melt.meta2, aes(Geographic.Distance,Distance)) + 
  stat_bin_hex(colour="white", na.rm=TRUE, bins = 20) +
  #geom_density_2d_filled() + scale_fill_brewer(palette="PuRd") +
  scale_fill_gradientn(colours=c("purple","green"), 
                       name = "Comparison Frequency", breaks=c(3,100,3000),
                       na.value=NA, trans="log10") + 
  theme_light() +
  scale_y_continuous(breaks=seq(0,250,10)) +
  labs(y="Pairwise genetic distance (SNPs)", x="Pairwise geographic distance (kilometers)") +
  theme.text.size + theme(legend.key.size = unit(0.75,"line")) +
  theme(strip.background = element_rect(fill='white',linetype="solid"), strip.text.x=element_text(color="grey25", size=10), strip.text.y=element_text(color="grey25", size=10)) +
  theme(legend.position="bottom") +
  #ggtitle("Pairwise SNPs (same sublineage) and Years within and between British Columbia (Canada) and England (UK)") + theme(plot.title = element_text(size = 10)) +
  facet_wrap(vars(TPA.pinecone.sublineage.t1), labeller=as_labeller(sublineage_names)) +
  stat_smooth(method='lm', fullrange=F,se=T, color='black', level=95) +
  NULL


p.geographic.vs.genetic.distance.hex.sublin 



```

Combine lineage and sublineage distance plots
```{r, fig.width=8, fig.height=8, message=FALSE, warning=FALSE}
plot.geographic.distance.within.Lin.Sublin <- plot_grid(p.geographic.vs.genetic.distance.hex.Lineage, p.geographic.vs.genetic.distance.hex.sublin + theme(legend.position='none'), ncol=1, rel_heights=c(3,4), labels=c('A','B'), label_size=11) 


#Cairo::Cairo(file=paste0(Figure_output_directory, "Supplementary_Figure12_Genetic-vs-Geographic_distance__02-2021.svg"), width = 600, height = 600,type="svg",units = "pt")
plot.geographic.distance.within.Lin.Sublin
#dev.off()

```

# Create Supplementary Metadata output file

```{r}
#colnames(TPA.meta1.2)
TPA.meta1.2.final.Supplementary <- TPA.meta1.2[,c("Sample_Name", "Sanger_Lane_ID_raw", "Cleaned_fastq_id", "Cleaned_fastq_readcount","SRR/ENA_Accession","Reads_or_assemblies","Species","Sample_Year","Citation","Sample_Type","Clinical","Direct_from_clin","Duplicate","Geo_Region","Geo_Country","Continent","TPA_Lineage","Proportion-N_>5_mapping+masking_Nichols","Mapping_Good<25%N","Mapping_Terrible>75%N","Mean_mapping_coverage")]

# Apend info about which samples were used in the finescale clustering analysis
#colnames(TPA.meta1.2.pinecone)
TPA.meta1.2.final.Supplementary <- plyr::join(TPA.meta1.2.final.Supplementary,TPA.meta1.2.pinecone[,c("Sample_Name","TPA.pinecone.sublineage")], by="Sample_Name", type="full")
TPA.meta1.2.final.Supplementary$finescale.analysis <- ifelse(is.na(TPA.meta1.2.final.Supplementary$TPA.pinecone.sublineage),"No","Yes")

# Append info about which samples were used in the temporal BEAST analysis
in.beast.tree <- data.frame(Sample_Name=full.beast2.tipnames$meta.name,stringsAsFactors=F)
in.beast.tree$full.temporal.analysis <- "Yes"
TPA.meta1.2.final.Supplementary <- plyr::join(TPA.meta1.2.final.Supplementary, in.beast.tree, by="Sample_Name", type="full")
TPA.meta1.2.final.Supplementary$full.temporal.analysis <- ifelse(is.na(TPA.meta1.2.final.Supplementary$full.temporal.analysis),"No","Yes")

# Append antimicrobial resistance information
TPA.meta1.2.final.Supplementary <- plyr::join(TPA.meta1.2.final.Supplementary, data.frame(Sample_Name=TPA.global.compmapping.23s$Sample, A2058G=TPA.global.compmapping.23s$VariantPresent_A2058G_redo, A2059G=TPA.global.compmapping.23s$VariantPresent_A2059G_redo, stringsAsFactors=F), by="Sample_Name", type="full")

# Relabel citation for novel sequences

TPA.meta1.2.final.Supplementary[TPA.meta1.2.final.Supplementary$Citation=="unpublished-WSI","Citation"] <- "This_Study"
TPA.meta1.2.final.Supplementary[TPA.meta1.2.final.Supplementary$Citation=="unpublished-Taiaroa","Citation"] <- "This_Study"


# Reorder dataframe by Citation
TPA.meta1.2.final.Supplementary <- TPA.meta1.2.final.Supplementary[order(TPA.meta1.2.final.Supplementary$Citation,TPA.meta1.2.final.Supplementary$Sample_Name),]


#write.csv(TPA.meta1.2.final.Supplementary, file=paste0(Figure_output_directory, "Supplementary_Data1_Sample-Metadata__03-2021.csv"), row.names = F)

```


ENA assembly submissions (high quality assemblies - assemblies not actually used for paper, but will publish separately for the community)
```{r}

#ENA.assemblies <- TPA.meta1.2[(TPA.meta1.2$`CheckM>95%_Completeness`=='yes' & TPA.meta1.2$`CheckM<5%-contamination`=='yes' & TPA.meta1.2$`contigs_<600`=='yes' & TPA.meta1.2$`Mapping_Good<25%N`=='Yes' & TPA.meta1.2$Citation=="unpublished-WSI"),c("Sample_Name", "Sanger_Lane_ID_raw", "Cleaned_fastq_id", "Cleaned_fastq_readcount","SRR/ENA_Accession","Reads_or_assemblies","Species","Sample_Year","Citation","Sample_Type","Clinical","Direct_from_clin","Duplicate","Geo_Region","Geo_Country","Continent","TPA_Lineage","Proportion-N_>5_mapping+masking_Nichols","Mapping_Good<25%N","Mapping_Terrible>75%N","Mean_mapping_coverage","SPAdes-pilon_assembly_id","Assembly_#_contigs","Assembly_N50","CheckM-completeness","CheckM-contamination")]


#ENA.assemblies <- ENA.assemblies[ENA.assemblies$`Assembly_#_contigs`<250,]
#ENA.assemblies <- ENA.assemblies[ENA.assemblies$`CheckM-contamination`<2.5,]
#ENA.assemblies <- ENA.assemblies[!is.na(ENA.assemblies$Sample_Name),]


c("NL12", "PHE120006A", "PHE120007A", "PHE120009B", "PHE120011A", "PHE120021A", "PHE120024A", "PHE130036A", "PHE130039A", "PHE130040A", "PHE130043A", "PHE130045A", "PHE130047A", "PHE130052A", "PHE130053A", "PHE130064A", "PHE140073A", "PHE140074A", "PHE140084A", "PHE140085A", "PHE140093A", "PHE140095A", "PHE150110A", "PHE150119A", "PHE150121A", "PHE150122A", "PHE150126A", "PHE150130A", "PHE150131A", "PHE150133A", "PHE150137A", "PHE150138A", "PHE150143A", "PHE150145A", "PHE150148A", "PHE150149A", "PHE150153A", "PHE150160A", "PHE150161A", "PHE150162A", "PHE150166A", "PHE150177A", "PHE160190A", "PHE160196A", "PHE160197A", "PHE160198A", "PHE160203A", "PHE160211A", "PHE160214A", "PHE160217A", "PHE160224A", "PHE160239A", "PHE160240A", "PHE160243A", "PHE160249A", "PHE160253A", "PHE160256A", "PHE160259A", "PHE160262A", "PHE160264A", "PHE160277A", "PHE160280A", "PHE160287A", "PHE160290A", "PHE160298A", "PHE160299A", "PHE160309A", "PHE160312A", "PHE160315A", "PHE160316A", "PHE170328A", "PHE170329A", "PHE170333A", "PHE170336B", "PHE170346A", "PHE170349A", "PHE170351A", "PHE170352A", "PHE170356A", "PHE170366A", "PHE170370A", "PHE170372A", "PHE170374A", "PHE170380A", "PHE170381A", "PHE170387A", "PHE170388A", "PHE170398A", "PHE170402A", "PHE170403A", "PHE170405A", "PHE170407A", "PHE170408A", "TPA_ALC015", "TPA_ALC034", "TPA_ALC077", "TPA_BCC004", "TPA_BCC005", "TPA_BCC008", "TPA_BCC009", "TPA_BCC012", "TPA_BCC014", "TPA_BCC023", "TPA_BCC030", "TPA_BCC032", "TPA_BCC034", "TPA_BCC040", "TPA_BCC049", "TPA_BCC052", "TPA_BCC055", "TPA_BCC058", "TPA_BCC061", "TPA_BCC063", "TPA_BCC064", "TPA_BCC075", "TPA_BCC079", "TPA_BCC085", "TPA_BCC088", "TPA_BCC101", "TPA_BCC102", "TPA_BCC106", "TPA_BCC108", "TPA_BCC109", "TPA_BCC111", "TPA_BCC122", "TPA_BCC127", "TPA_BCC128", "TPA_BCC129", "TPA_BCC130", "TPA_BCC132", "TPA_BCC134", "TPA_BCC137", "TPA_BCC139", "TPA_BCC140", "TPA_BCC141", "TPA_reBCC165", "TPA_BCC166", "TPA_BCC174", "TPA_BCC175", "TPA_BCC181", "TPA_BCC185", "TPA_BCC186", "TPA_BCC187", "TPA_BCC196", "TPA_BCC197", "TPA_BCC198", "TPA_BCC199", "TPA_EIR008", "TPA_EIR013", "TPA_EIR015", "TPA_EIR017", "TPA_ESBCN005", "TPA_OMI002", "TPA_OMI015", "TPA_OMI021", "TPA_OMI075", "TPA_UKBIR026", "TPA_UKBIR028", "TPA_UKBIR044", "TPA_UKBIR052", "TPA_UKBRG004", "TPA_UKBRG008", "TPA_UKBRG010", "TPA_UKBRG012", "TPA_UKBRG017", "TPA_UKBRG018", "TPA_UKLEE004", "TPA_UKMAN003", "TPA_UKMAN019", "TPA_UKMAN027", "TPA_UKMAN047", "TPA_UKMAN054", "TPA_USL-BAL-2", "TPA_USL-BAL-6", "TPA_USL-BAL-7", "TPA_USL-BAL-8", "TPA_USL-Grady-1", "TPA_USL-Haiti-B", "TPA_USL-Phil-1", "TPA_USL-Phil-3", "TPA_USL-SEA-81-3", "TPA_USL-SEA-81-8", "TPA_USL-SEA-83-1", "TPA_USL-SEA-83-2", "TPA_USL-SEA-84-2", "TPA_USL-SEA-86-1", "TPA_USL-SEA-87-1", "TPA_ZIM005", "TPA_ZIM007", "TPA_ZIM018", "TPA_ZIM024", "TPA_ZIM025", "TPA_ZIM028", "UW202B", "TPA_ALC105", "TPA_BCC103", "TPA_OMI006", "TPA_OMI022", "TPA_OMI029", "TPA_OMI033", "TPA_ZIM019", "TPA_HUN190022", "TPA_HUN200024", "TPA_HUN190020", "TPA_RUS_Tuva-39", "TPA_RUS_Tuva-58", "TPA_RUS_Tuva-59", "TPA_RUS_Tuva-26", "TPA_RUS_Tuva-41", "TPA_SWE-996", "TPA_AUSBR-41")


```



